summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2017-04-07 13:27:42 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2017-04-07 13:27:42 +1000
commit6c76650b6e643ed2665c72305c47f08f0babd07a (patch)
tree0e9dc0ac01586e3a0342a9e09eec0f969f1efb54
parent6b382d660242a32d063d0a7e5200c3e5e702c243 (diff)
parent71c4271c9a58a75e2b6f0ffa81292befdfb02e93 (diff)
downloadlinux-next-6c76650b6e643ed2665c72305c47f08f0babd07a.tar.gz
Merge remote-tracking branch 'tip/auto-latest'
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt7
-rw-r--r--Documentation/trace/kprobetrace.txt5
-rw-r--r--Documentation/usb/usb3-debug-port.rst100
-rw-r--r--Documentation/x86/x86_64/mm.txt36
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/Kconfig7
-rw-r--r--arch/arm/Kconfig2
-rw-r--r--arch/arm/include/asm/efi.h14
-rw-r--r--arch/arm/kernel/vmlinux-xip.lds.S2
-rw-r--r--arch/arm/kernel/vmlinux.lds.S2
-rw-r--r--arch/arm64/Kconfig2
-rw-r--r--arch/arm64/include/asm/bug.h3
-rw-r--r--arch/arm64/include/asm/efi.h24
-rw-r--r--arch/arm64/kernel/acpi.c3
-rw-r--r--arch/avr32/kernel/vmlinux.lds.S1
-rw-r--r--arch/blackfin/kernel/vmlinux.lds.S2
-rw-r--r--arch/c6x/kernel/vmlinux.lds.S2
-rw-r--r--arch/cris/kernel/vmlinux.lds.S2
-rw-r--r--arch/frv/kernel/vmlinux.lds.S2
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S2
-rw-r--r--arch/mips/kernel/vmlinux.lds.S1
-rw-r--r--arch/parisc/include/asm/bug.h8
-rw-r--r--arch/powerpc/Kconfig2
-rw-r--r--arch/powerpc/include/asm/bug.h4
-rw-r--r--arch/powerpc/include/asm/mmu_context.h6
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S2
-rw-r--r--arch/s390/include/asm/bug.h4
-rw-r--r--arch/s390/include/asm/mmu_context.h6
-rw-r--r--arch/sh/include/asm/bug.h4
-rw-r--r--arch/um/Kconfig.common5
-rw-r--r--arch/um/include/asm/mmu_context.h6
-rw-r--r--arch/um/include/shared/os.h4
-rw-r--r--arch/unicore32/include/asm/mmu_context.h6
-rw-r--r--arch/x86/Kconfig17
-rw-r--r--arch/x86/Kconfig.debug27
-rw-r--r--arch/x86/boot/cpucheck.c9
-rw-r--r--arch/x86/boot/cpuflags.c12
-rw-r--r--arch/x86/entry/entry_32.S171
-rw-r--r--arch/x86/entry/entry_64.S22
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c24
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S3
-rw-r--r--arch/x86/entry/vdso/vdso2c.c3
-rw-r--r--arch/x86/entry/vdso/vma.c9
-rw-r--r--arch/x86/events/amd/iommu.c325
-rw-r--r--arch/x86/events/amd/iommu.h18
-rw-r--r--arch/x86/events/intel/bts.c16
-rw-r--r--arch/x86/events/intel/core.c22
-rw-r--r--arch/x86/events/intel/pt.c129
-rw-r--r--arch/x86/events/intel/pt.h2
-rw-r--r--arch/x86/hyperv/hv_init.c48
-rw-r--r--arch/x86/include/asm/apic.h7
-rw-r--r--arch/x86/include/asm/atomic.h33
-rw-r--r--arch/x86/include/asm/atomic64_64.h46
-rw-r--r--arch/x86/include/asm/bug.h80
-rw-r--r--arch/x86/include/asm/clocksource.h3
-rw-r--r--arch/x86/include/asm/cmpxchg.h70
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/desc.h147
-rw-r--r--arch/x86/include/asm/disabled-features.h8
-rw-r--r--arch/x86/include/asm/elf.h28
-rw-r--r--arch/x86/include/asm/fixmap.h4
-rw-r--r--arch/x86/include/asm/kasan.h9
-rw-r--r--arch/x86/include/asm/kexec.h1
-rw-r--r--arch/x86/include/asm/mce.h12
-rw-r--r--arch/x86/include/asm/mmu_context.h16
-rw-r--r--arch/x86/include/asm/mshyperv.h54
-rw-r--r--arch/x86/include/asm/msr-index.h11
-rw-r--r--arch/x86/include/asm/page_64.h16
-rw-r--r--arch/x86/include/asm/page_64_types.h10
-rw-r--r--arch/x86/include/asm/paravirt.h54
-rw-r--r--arch/x86/include/asm/paravirt_types.h17
-rw-r--r--arch/x86/include/asm/pgalloc.h37
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable-3level.h47
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable.h137
-rw-r--r--arch/x86/include/asm/pgtable_32.h1
-rw-r--r--arch/x86/include/asm/pgtable_64.h39
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h32
-rw-r--r--arch/x86/include/asm/pgtable_types.h46
-rw-r--r--arch/x86/include/asm/processor.h19
-rw-r--r--arch/x86/include/asm/proto.h4
-rw-r--r--arch/x86/include/asm/reboot.h1
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/sparsemem.h9
-rw-r--r--arch/x86/include/asm/stackprotector.h2
-rw-r--r--arch/x86/include/asm/thread_info.h6
-rw-r--r--arch/x86/include/asm/tlbflush.h10
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h82
-rw-r--r--arch/x86/include/asm/vdso.h1
-rw-r--r--arch/x86/include/asm/xen/page.h8
-rw-r--r--arch/x86/include/uapi/asm/prctl.h11
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/boot.c6
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/apic/apic.c6
-rw-r--r--arch/x86/kernel/apm_32.c6
-rw-r--r--arch/x86/kernel/cpu/common.c59
-rw-r--r--arch/x86/kernel/cpu/intel.c40
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/dev-mcelog.c397
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c554
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c5
-rw-r--r--arch/x86/kernel/dumpstack.c3
-rw-r--r--arch/x86/kernel/dumpstack_32.c12
-rw-r--r--arch/x86/kernel/dumpstack_64.c10
-rw-r--r--arch/x86/kernel/early_printk.c5
-rw-r--r--arch/x86/kernel/espfix_64.c12
-rw-r--r--arch/x86/kernel/ftrace_32.S244
-rw-r--r--arch/x86/kernel/ftrace_64.S (renamed from arch/x86/kernel/mcount_64.S)6
-rw-r--r--arch/x86/kernel/head_64.S10
-rw-r--r--arch/x86/kernel/machine_kexec_32.c4
-rw-r--r--arch/x86/kernel/machine_kexec_64.c14
-rw-r--r--arch/x86/kernel/paravirt.c13
-rw-r--r--arch/x86/kernel/pci-calgary_64.c5
-rw-r--r--arch/x86/kernel/process.c151
-rw-r--r--arch/x86/kernel/process_32.c7
-rw-r--r--arch/x86/kernel/process_64.c115
-rw-r--r--arch/x86/kernel/ptrace.c8
-rw-r--r--arch/x86/kernel/reboot.c5
-rw-r--r--arch/x86/kernel/setup.c52
-rw-r--r--arch/x86/kernel/setup_percpu.c23
-rw-r--r--arch/x86/kernel/signal_compat.c4
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c15
-rw-r--r--arch/x86/kernel/tboot.c6
-rw-r--r--arch/x86/kernel/tls.c11
-rw-r--r--arch/x86/kernel/traps.c46
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kvm/svm.c4
-rw-r--r--arch/x86/kvm/vmx.c12
-rw-r--r--arch/x86/lib/clear_page_64.S17
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/dump_pagetables.c59
-rw-r--r--arch/x86/mm/fault.c66
-rw-r--r--arch/x86/mm/gup.c475
-rw-r--r--arch/x86/mm/hugetlbpage.c9
-rw-r--r--arch/x86/mm/ident_map.c51
-rw-r--r--arch/x86/mm/init_32.c71
-rw-r--r--arch/x86/mm/init_64.c183
-rw-r--r--arch/x86/mm/ioremap.c3
-rw-r--r--arch/x86/mm/kasan_init_64.c33
-rw-r--r--arch/x86/mm/mmap.c125
-rw-r--r--arch/x86/mm/numa.c25
-rw-r--r--arch/x86/mm/pageattr.c54
-rw-r--r--arch/x86/mm/pgtable.c36
-rw-r--r--arch/x86/mm/pgtable_32.c8
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi_32.c4
-rw-r--r--arch/x86/platform/efi/efi_64.c43
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile3
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bt.c108
-rw-r--r--arch/x86/platform/uv/tlb_uv.c195
-rw-r--r--arch/x86/power/cpu.c7
-rw-r--r--arch/x86/power/hibernate_32.c7
-rw-r--r--arch/x86/power/hibernate_64.c47
-rw-r--r--arch/x86/ras/Kconfig14
-rw-r--r--arch/x86/um/Makefile4
-rw-r--r--arch/x86/um/asm/ptrace.h2
-rw-r--r--arch/x86/um/bug.c21
-rw-r--r--arch/x86/um/os-Linux/prctl.c4
-rw-r--r--arch/x86/um/syscalls_32.c7
-rw-r--r--arch/x86/um/syscalls_64.c20
-rw-r--r--arch/x86/xen/enlighten.c3
-rw-r--r--arch/x86/xen/mmu.c398
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--drivers/acpi/Kconfig2
-rw-r--r--drivers/acpi/bgrt.c6
-rw-r--r--drivers/clocksource/em_sti.c46
-rw-r--r--drivers/clocksource/h8300_timer8.c8
-rw-r--r--drivers/clocksource/sh_cmt.c45
-rw-r--r--drivers/clocksource/sh_tmu.c26
-rw-r--r--drivers/firmware/efi/Makefile1
-rw-r--r--drivers/firmware/efi/efi-bgrt.c (renamed from arch/x86/platform/efi/efi-bgrt.c)0
-rw-r--r--drivers/firmware/efi/efi-pstore.c6
-rw-r--r--drivers/firmware/efi/libstub/arm-stub.c80
-rw-r--r--drivers/firmware/efi/libstub/arm32-stub.c150
-rw-r--r--drivers/firmware/efi/libstub/arm64-stub.c4
-rw-r--r--drivers/firmware/efi/libstub/efi-stub-helper.c32
-rw-r--r--drivers/firmware/efi/libstub/efistub.h9
-rw-r--r--drivers/firmware/efi/libstub/fdt.c57
-rw-r--r--drivers/firmware/efi/libstub/gop.c6
-rw-r--r--drivers/firmware/efi/libstub/secureboot.c2
-rw-r--r--drivers/hv/Kconfig3
-rw-r--r--drivers/hwtracing/coresight/coresight-etb10.c9
-rw-r--r--drivers/hwtracing/coresight/coresight-etm-perf.c9
-rw-r--r--drivers/hwtracing/coresight/coresight-priv.h2
-rw-r--r--drivers/hwtracing/coresight/coresight-tmc-etf.c7
-rw-r--r--drivers/iommu/amd_iommu.c6
-rw-r--r--drivers/iommu/amd_iommu_init.c101
-rw-r--r--drivers/iommu/amd_iommu_proto.h8
-rw-r--r--drivers/iommu/amd_iommu_types.h3
-rw-r--r--drivers/lguest/x86/core.c6
-rw-r--r--drivers/pnp/pnpbios/bioscalls.c10
-rw-r--r--drivers/ras/Makefile3
-rw-r--r--drivers/ras/cec.c532
-rw-r--r--drivers/ras/debugfs.c2
-rw-r--r--drivers/ras/debugfs.h8
-rw-r--r--drivers/ras/ras.c11
-rw-r--r--drivers/usb/Makefile2
-rw-r--r--drivers/usb/early/Makefile1
-rw-r--r--drivers/usb/early/xhci-dbc.c1014
-rw-r--r--drivers/usb/early/xhci-dbc.h211
-rw-r--r--drivers/usb/serial/usb_debug.c28
-rw-r--r--drivers/video/fbdev/efifb.c66
-rw-r--r--fs/exec.c1
-rw-r--r--include/asm-generic/bug.h18
-rw-r--r--include/asm-generic/mm_hooks.h6
-rw-r--r--include/asm-generic/pgtable.h25
-rw-r--r--include/asm-generic/vmlinux.lds.h5
-rw-r--r--include/linux/atomic.h46
-rw-r--r--include/linux/bug.h2
-rw-r--r--include/linux/clockchips.h1
-rw-r--r--include/linux/compat.h2
-rw-r--r--include/linux/coresight.h2
-rw-r--r--include/linux/efi-bgrt.h5
-rw-r--r--include/linux/efi.h5
-rw-r--r--include/linux/hrtimer.h4
-rw-r--r--include/linux/init_task.h1
-rw-r--r--include/linux/kprobes.h2
-rw-r--r--include/linux/lockdep.h3
-rw-r--r--include/linux/mm.h4
-rw-r--r--include/linux/mm_types.h5
-rw-r--r--include/linux/module.h6
-rw-r--r--include/linux/pagemap.h4
-rw-r--r--include/linux/percpu.h1
-rw-r--r--include/linux/perf_event.h17
-rw-r--r--include/linux/ras.h13
-rw-r--r--include/linux/refcount.h19
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/sched/rt.h23
-rw-r--r--include/linux/smp.h12
-rw-r--r--include/linux/thread_info.h4
-rw-r--r--include/linux/usb/xhci-dbgp.h29
-rw-r--r--include/trace/events/sched.h16
-rw-r--r--include/trace/events/xen.h28
-rw-r--r--include/uapi/linux/perf_event.h33
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/events/core.c139
-rw-r--r--kernel/events/ring_buffer.c34
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/futex.c502
-rw-r--r--kernel/irq/affinity.c20
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/kprobes.c46
-rw-r--r--kernel/locking/lockdep.c242
-rw-r--r--kernel/locking/rtmutex-debug.c9
-rw-r--r--kernel/locking/rtmutex-debug.h3
-rw-r--r--kernel/locking/rtmutex.c390
-rw-r--r--kernel/locking/rtmutex.h2
-rw-r--r--kernel/locking/rtmutex_common.h25
-rw-r--r--kernel/locking/rwsem.c6
-rw-r--r--kernel/locking/test-ww_mutex.c29
-rw-r--r--kernel/module.c42
-rw-r--r--kernel/nsproxy.c3
-rw-r--r--kernel/sched/core.c263
-rw-r--r--kernel/sched/fair.c385
-rw-r--r--kernel/sched/features.h7
-rw-r--r--kernel/sched/rt.c81
-rw-r--r--kernel/sched/sched.h65
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/hrtimer.c5
-rw-r--r--kernel/time/posix-cpu-timers.c9
-rw-r--r--kernel/time/sched_clock.c5
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/time/timer_list.c6
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/trace.c1
-rw-r--r--kernel/trace/trace_kprobe.c9
-rw-r--r--lib/bug.c28
-rw-r--r--lib/refcount.c169
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/gup.c158
-rw-r--r--mm/percpu.c40
-rwxr-xr-xscripts/checksyscalls.sh1
-rw-r--r--tools/arch/arm/include/uapi/asm/kvm.h13
-rw-r--r--tools/arch/arm64/include/uapi/asm/kvm.h13
-rw-r--r--tools/arch/powerpc/include/uapi/asm/kvm.h22
-rw-r--r--tools/arch/x86/include/asm/atomic.h7
-rw-r--r--tools/arch/x86/include/asm/cmpxchg.h89
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h8
-rw-r--r--tools/build/Makefile.feature1
-rw-r--r--tools/build/feature/Makefile10
-rw-r--r--tools/build/feature/test-all.c5
-rw-r--r--tools/build/feature/test-sched_getcpu.c7
-rw-r--r--tools/include/asm-generic/atomic-gcc.h8
-rw-r--r--tools/include/linux/atomic.h6
-rw-r--r--tools/include/linux/compiler-gcc.h4
-rw-r--r--tools/include/linux/compiler.h4
-rw-r--r--tools/include/linux/kernel.h4
-rw-r--r--tools/include/linux/refcount.h151
-rw-r--r--tools/include/linux/types.h1
-rw-r--r--tools/include/uapi/linux/fcntl.h72
-rw-r--r--tools/include/uapi/linux/perf_event.h33
-rw-r--r--tools/include/uapi/linux/stat.h176
-rw-r--r--tools/lib/api/fs/fs.c29
-rw-r--r--tools/lib/api/fs/fs.h1
-rw-r--r--tools/perf/.gitignore2
-rw-r--r--tools/perf/Build1
-rw-r--r--tools/perf/Documentation/perf-ftrace.txt18
-rw-r--r--tools/perf/Documentation/perf-list.txt8
-rw-r--r--tools/perf/Documentation/perf-record.txt3
-rw-r--r--tools/perf/Documentation/perf-report.txt13
-rw-r--r--tools/perf/Documentation/perf-sched.txt4
-rw-r--r--tools/perf/Documentation/perf-script.txt16
-rw-r--r--tools/perf/Documentation/perf-stat.txt6
-rw-r--r--tools/perf/MANIFEST4
-rw-r--r--tools/perf/Makefile.config4
-rw-r--r--tools/perf/arch/powerpc/util/perf_regs.c111
-rw-r--r--tools/perf/arch/powerpc/util/sym-handling.c14
-rw-r--r--tools/perf/arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--tools/perf/arch/x86/util/perf_regs.c226
-rw-r--r--tools/perf/bench/bench.h20
-rw-r--r--tools/perf/bench/futex-hash.c4
-rw-r--r--tools/perf/bench/futex-lock-pi.c4
-rw-r--r--tools/perf/bench/futex-requeue.c4
-rw-r--r--tools/perf/bench/futex-wake-parallel.c4
-rw-r--r--tools/perf/bench/futex-wake.c4
-rw-r--r--tools/perf/bench/futex.h10
-rw-r--r--tools/perf/bench/mem-functions.c4
-rw-r--r--tools/perf/bench/numa.c5
-rw-r--r--tools/perf/bench/sched-messaging.c3
-rw-r--r--tools/perf/bench/sched-pipe.c2
-rw-r--r--tools/perf/builtin-annotate.c3
-rw-r--r--tools/perf/builtin-bench.c12
-rw-r--r--tools/perf/builtin-buildid-cache.c3
-rw-r--r--tools/perf/builtin-buildid-list.c3
-rw-r--r--tools/perf/builtin-c2c.c8
-rw-r--r--tools/perf/builtin-config.c2
-rw-r--r--tools/perf/builtin-data.c9
-rw-r--r--tools/perf/builtin-diff.c3
-rw-r--r--tools/perf/builtin-evlist.c2
-rw-r--r--tools/perf/builtin-ftrace.c154
-rw-r--r--tools/perf/builtin-help.c15
-rw-r--r--tools/perf/builtin-inject.c15
-rw-r--r--tools/perf/builtin-kallsyms.c2
-rw-r--r--tools/perf/builtin-kmem.c5
-rw-r--r--tools/perf/builtin-kvm.c18
-rw-r--r--tools/perf/builtin-list.c16
-rw-r--r--tools/perf/builtin-lock.c29
-rw-r--r--tools/perf/builtin-mem.c7
-rw-r--r--tools/perf/builtin-probe.c12
-rw-r--r--tools/perf/builtin-record.c37
-rw-r--r--tools/perf/builtin-report.c8
-rw-r--r--tools/perf/builtin-sched.c32
-rw-r--r--tools/perf/builtin-script.c309
-rw-r--r--tools/perf/builtin-stat.c192
-rw-r--r--tools/perf/builtin-timechart.c23
-rw-r--r--tools/perf/builtin-top.c2
-rw-r--r--tools/perf/builtin-trace.c85
-rw-r--r--tools/perf/builtin-version.c3
-rw-r--r--tools/perf/builtin.h58
-rwxr-xr-xtools/perf/check-headers.sh2
-rw-r--r--tools/perf/command-list.txt1
-rw-r--r--tools/perf/perf.c111
-rw-r--r--tools/perf/perf.h1
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwell/uncore.json278
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json28
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellde/uncore-memory.json29
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json26
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json28
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json6
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json21
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json26
-rw-r--r--tools/perf/pmu-events/arch/x86/haswell/uncore.json374
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json28
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json6
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json21
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json26
-rw-r--r--tools/perf/pmu-events/arch/x86/ivybridge/uncore.json314
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json22
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json12
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json19
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json53
-rw-r--r--tools/perf/pmu-events/arch/x86/jaketown/uncore-cache.json13
-rw-r--r--tools/perf/pmu-events/arch/x86/jaketown/uncore-interconnect.json12
-rw-r--r--tools/perf/pmu-events/arch/x86/jaketown/uncore-memory.json21
-rw-r--r--tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json53
-rw-r--r--tools/perf/pmu-events/arch/x86/mapfile.csv1
-rw-r--r--tools/perf/pmu-events/arch/x86/sandybridge/uncore.json314
-rw-r--r--tools/perf/pmu-events/arch/x86/skylake/uncore.json254
-rw-r--r--tools/perf/pmu-events/jevents.c28
-rw-r--r--tools/perf/pmu-events/jevents.h3
-rw-r--r--tools/perf/pmu-events/pmu-events.h2
-rw-r--r--tools/perf/tests/Build1
-rw-r--r--tools/perf/tests/builtin-test.c6
-rw-r--r--tools/perf/tests/cpumap.c2
-rw-r--r--tools/perf/tests/expr.c56
-rw-r--r--tools/perf/tests/sdt.c2
-rw-r--r--tools/perf/tests/tests.h1
-rw-r--r--tools/perf/tests/thread-map.c6
-rw-r--r--tools/perf/tests/thread-mg-share.c12
-rw-r--r--tools/perf/trace/beauty/Build1
-rw-r--r--tools/perf/trace/beauty/beauty.h24
-rw-r--r--tools/perf/trace/beauty/statx.c72
-rw-r--r--tools/perf/ui/browsers/hists.c183
-rw-r--r--tools/perf/ui/stdio/hist.c86
-rw-r--r--tools/perf/util/Build9
-rw-r--r--tools/perf/util/alias.c78
-rw-r--r--tools/perf/util/annotate.c35
-rw-r--r--tools/perf/util/annotate.h2
-rw-r--r--tools/perf/util/auxtrace.c4
-rw-r--r--tools/perf/util/build-id.c8
-rw-r--r--tools/perf/util/cache.h1
-rw-r--r--tools/perf/util/callchain.c163
-rw-r--r--tools/perf/util/callchain.h3
-rw-r--r--tools/perf/util/cgroup.c6
-rw-r--r--tools/perf/util/cgroup.h4
-rw-r--r--tools/perf/util/cloexec.h6
-rw-r--r--tools/perf/util/comm.c15
-rw-r--r--tools/perf/util/config.c54
-rw-r--r--tools/perf/util/cpumap.c62
-rw-r--r--tools/perf/util/cpumap.h5
-rw-r--r--tools/perf/util/data-convert-bt.c1
-rw-r--r--tools/perf/util/dso.c6
-rw-r--r--tools/perf/util/dso.h4
-rw-r--r--tools/perf/util/dump-insn.c14
-rw-r--r--tools/perf/util/dump-insn.h22
-rw-r--r--tools/perf/util/event.c159
-rw-r--r--tools/perf/util/event.h20
-rw-r--r--tools/perf/util/evlist.c31
-rw-r--r--tools/perf/util/evlist.h4
-rw-r--r--tools/perf/util/evsel.c7
-rw-r--r--tools/perf/util/evsel.h5
-rw-r--r--tools/perf/util/expr.h25
-rw-r--r--tools/perf/util/expr.y173
-rw-r--r--tools/perf/util/header.c8
-rw-r--r--tools/perf/util/help-unknown-cmd.c8
-rw-r--r--tools/perf/util/hist.c14
-rw-r--r--tools/perf/util/hist.h2
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c26
-rw-r--r--tools/perf/util/machine.c36
-rw-r--r--tools/perf/util/machine.h3
-rw-r--r--tools/perf/util/map.c13
-rw-r--r--tools/perf/util/map.h10
-rw-r--r--tools/perf/util/namespaces.c36
-rw-r--r--tools/perf/util/namespaces.h26
-rw-r--r--tools/perf/util/parse-events.c83
-rw-r--r--tools/perf/util/parse-events.h30
-rw-r--r--tools/perf/util/parse-events.y73
-rw-r--r--tools/perf/util/perf_regs.c6
-rw-r--r--tools/perf/util/perf_regs.h7
-rw-r--r--tools/perf/util/pmu.c32
-rw-r--r--tools/perf/util/pmu.h6
-rw-r--r--tools/perf/util/probe-event.c24
-rw-r--r--tools/perf/util/probe-file.c212
-rw-r--r--tools/perf/util/probe-file.h2
-rw-r--r--tools/perf/util/session.c34
-rw-r--r--tools/perf/util/sort.c103
-rw-r--r--tools/perf/util/sort.h9
-rw-r--r--tools/perf/util/srcline.c246
-rw-r--r--tools/perf/util/stat-shadow.c197
-rw-r--r--tools/perf/util/stat.h2
-rw-r--r--tools/perf/util/symbol-elf.c30
-rw-r--r--tools/perf/util/symbol-minimal.c7
-rw-r--r--tools/perf/util/symbol.h6
-rw-r--r--tools/perf/util/thread.c50
-rw-r--r--tools/perf/util/thread.h10
-rw-r--r--tools/perf/util/thread_map.c20
-rw-r--r--tools/perf/util/thread_map.h4
-rw-r--r--tools/perf/util/tool.h2
-rw-r--r--tools/perf/util/trace-event-read.c4
-rw-r--r--tools/perf/util/util.h24
-rw-r--r--tools/perf/util/values.c63
-rw-r--r--tools/scripts/Makefile.include9
-rw-r--r--tools/testing/selftests/x86/ldt_gdt.c46
471 files changed, 14599 insertions, 4544 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 739435b8fd8e..c2f220dc0345 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -989,6 +989,7 @@
earlyprintk=ttySn[,baudrate]
earlyprintk=dbgp[debugController#]
earlyprintk=pciserial,bus:device.function[,baudrate]
+ earlyprintk=xdbc[xhciController#]
earlyprintk is useful when the kernel crashes before
the normal console is initialized. It is not enabled by
@@ -3178,6 +3179,12 @@
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
See Documentation/blockdev/ramdisk.txt.
+ ras=option[,option,...] [KNL] RAS-specific options
+
+ cec_disable [X86]
+ Disable the Correctable Errors Collector,
+ see CONFIG_RAS_CEC help text.
+
rcu_nocbs= [KNL]
The argument is a cpu list, as described above.
diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt
index 41ef9d8efe95..5ea85059db3b 100644
--- a/Documentation/trace/kprobetrace.txt
+++ b/Documentation/trace/kprobetrace.txt
@@ -8,8 +8,9 @@ Overview
--------
These events are similar to tracepoint based events. Instead of Tracepoint,
this is based on kprobes (kprobe and kretprobe). So it can probe wherever
-kprobes can probe (this means, all functions body except for __kprobes
-functions). Unlike the Tracepoint based event, this can be added and removed
+kprobes can probe (this means, all functions except those with
+__kprobes/nokprobe_inline annotation and those marked NOKPROBE_SYMBOL).
+Unlike the Tracepoint based event, this can be added and removed
dynamically, on the fly.
To enable this feature, build your kernel with CONFIG_KPROBE_EVENTS=y.
diff --git a/Documentation/usb/usb3-debug-port.rst b/Documentation/usb/usb3-debug-port.rst
new file mode 100644
index 000000000000..feb1a36a65b7
--- /dev/null
+++ b/Documentation/usb/usb3-debug-port.rst
@@ -0,0 +1,100 @@
+===============
+USB3 debug port
+===============
+
+:Author: Lu Baolu <baolu.lu@linux.intel.com>
+:Date: March 2017
+
+GENERAL
+=======
+
+This is a HOWTO for using the USB3 debug port on x86 systems.
+
+Before using any kernel debugging functionality based on USB3
+debug port, you need to::
+
+ 1) check whether any USB3 debug port is available in
+ your system;
+ 2) check which port is used for debugging purposes;
+ 3) have a USB 3.0 super-speed A-to-A debugging cable.
+
+INTRODUCTION
+============
+
+The xHCI debug capability (DbC) is an optional but standalone
+functionality provided by the xHCI host controller. The xHCI
+specification describes DbC in the section 7.6.
+
+When DbC is initialized and enabled, it will present a debug
+device through the debug port (normally the first USB3
+super-speed port). The debug device is fully compliant with
+the USB framework and provides the equivalent of a very high
+performance full-duplex serial link between the debug target
+(the system under debugging) and a debug host.
+
+EARLY PRINTK
+============
+
+DbC has been designed to log early printk messages. One use for
+this feature is kernel debugging. For example, when your machine
+crashes very early before the regular console code is initialized.
+Other uses include simpler, lockless logging instead of a full-
+blown printk console driver and klogd.
+
+On the debug target system, you need to customize a debugging
+kernel with CONFIG_EARLY_PRINTK_USB_XDBC enabled. And, add below
+kernel boot parameter::
+
+ "earlyprintk=xdbc"
+
+If there are multiple xHCI controllers in your system, you can
+append a host contoller index to this kernel parameter. This
+index starts from 0.
+
+Current design doesn't support DbC runtime suspend/resume. As
+the result, you'd better disable runtime power management for
+USB subsystem by adding below kernel boot parameter::
+
+ "usbcore.autosuspend=-1"
+
+Before starting the debug target, you should connect the debug
+port to a USB port (root port or port of any external hub) on
+the debug host. The cable used to connect these two ports
+should be a USB 3.0 super-speed A-to-A debugging cable.
+
+During early boot of the debug target, DbC will be detected and
+initialized. After initialization, the debug host should be able
+to enumerate the debug device in debug target. The debug host
+will then bind the debug device with the usb_debug driver module
+and create the /dev/ttyUSB device.
+
+If the debug device enumeration goes smoothly, you should be able
+to see below kernel messages on the debug host::
+
+ # tail -f /var/log/kern.log
+ [ 1815.983374] usb 4-3: new SuperSpeed USB device number 4 using xhci_hcd
+ [ 1815.999595] usb 4-3: LPM exit latency is zeroed, disabling LPM.
+ [ 1815.999899] usb 4-3: New USB device found, idVendor=1d6b, idProduct=0004
+ [ 1815.999902] usb 4-3: New USB device strings: Mfr=1, Product=2, SerialNumber=3
+ [ 1815.999903] usb 4-3: Product: Remote GDB
+ [ 1815.999904] usb 4-3: Manufacturer: Linux
+ [ 1815.999905] usb 4-3: SerialNumber: 0001
+ [ 1816.000240] usb_debug 4-3:1.0: xhci_dbc converter detected
+ [ 1816.000360] usb 4-3: xhci_dbc converter now attached to ttyUSB0
+
+You can use any communication program, for example minicom, to
+read and view the messages. Below simple bash scripts can help
+you to check the sanity of the setup.
+
+.. code-block:: sh
+
+ ===== start of bash scripts =============
+ #!/bin/bash
+
+ while true ; do
+ while [ ! -d /sys/class/tty/ttyUSB0 ] ; do
+ :
+ done
+ cat /dev/ttyUSB0
+ done
+ ===== end of bash scripts ===============
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5724092db811..b0798e281aa6 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -4,7 +4,7 @@
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [48:63] sign extension
+hole caused by [47:63] sign extension
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
@@ -19,16 +19,43 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
+ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
+ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+
+Virtual memory map with 5 level page tables:
+
+0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
+hole caused by [56:63] sign extension
+ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
+ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
+ff90000000000000 - ff91ffffffffffff (=49 bits) hole
+ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
+ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
+... unused hole ...
+ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
+... unused hole ...
+ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
+... unused hole ...
+ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
+... unused hole ...
+ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+Architecture defines a 64-bit virtual address. Implementations can support
+less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
+through to the most-significant implemented bit are set to either all ones
+or all zero. This causes hole between user space and kernel addresses.
+
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
holes).
-vmalloc space is lazily synchronized into the different PML4 pages of
-the processes using the page fault handler, with init_level4_pgt as
+vmalloc space is lazily synchronized into the different PML4/PML5 pages of
+the processes using the page fault handler, with init_top_pgt as
reference.
Current X86-64 implementations support up to 46 bits of address space (64 TB),
@@ -39,6 +66,9 @@ memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
during EFI runtime calls.
+The module mapping space size changes based on the CONFIG requirements for the
+following fixmap section.
+
Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.
diff --git a/MAINTAINERS b/MAINTAINERS
index 85b55cf5469a..0ae478277f15 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11204,6 +11204,7 @@ F: drivers/power/supply/bq27xxx_battery_i2c.c
TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER
M: John Stultz <john.stultz@linaro.org>
M: Thomas Gleixner <tglx@linutronix.de>
+R: Stephen Boyd <sboyd@codeaurora.org>
L: linux-kernel@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core
S: Supported
diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..c4d6833aacd9 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS
This value can be changed after boot using the
/proc/sys/vm/mmap_rnd_compat_bits tunable
+config HAVE_ARCH_COMPAT_MMAP_BASES
+ bool
+ help
+ This allows 64bit applications to invoke 32-bit mmap() syscall
+ and vice-versa 32-bit applications to call 64-bit mmap().
+ Required for applications doing different bitness syscalls.
+
config HAVE_COPY_THREAD_TLS
bool
help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 7b73657d790e..b0e8c9763e94 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1657,7 +1657,7 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
def_bool y
depends on ARM_LPAE
diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h
index e4e6a9d6a825..17f1f1a814ff 100644
--- a/arch/arm/include/asm/efi.h
+++ b/arch/arm/include/asm/efi.h
@@ -85,6 +85,18 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
*/
#define ZIMAGE_OFFSET_LIMIT SZ_128M
#define MIN_ZIMAGE_OFFSET MAX_UNCOMP_KERNEL_SIZE
-#define MAX_FDT_OFFSET ZIMAGE_OFFSET_LIMIT
+
+/* on ARM, the FDT should be located in the first 128 MB of RAM */
+static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base)
+{
+ return dram_base + ZIMAGE_OFFSET_LIMIT;
+}
+
+/* on ARM, the initrd should be loaded in a lowmem region */
+static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
+ unsigned long image_addr)
+{
+ return dram_base + SZ_512M;
+}
#endif /* _ASM_ARM_EFI_H */
diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S
index 37b2a11af345..8265b116218d 100644
--- a/arch/arm/kernel/vmlinux-xip.lds.S
+++ b/arch/arm/kernel/vmlinux-xip.lds.S
@@ -242,6 +242,8 @@ SECTIONS
}
_edata_loc = __data_loc + SIZEOF(.data);
+ BUG_TABLE
+
#ifdef CONFIG_HAVE_TCM
/*
* We align everything to a page boundary so we can
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index ce18007f9e4e..c83a7ba737d6 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -262,6 +262,8 @@ SECTIONS
}
_edata_loc = __data_loc + SIZEOF(.data);
+ BUG_TABLE
+
#ifdef CONFIG_HAVE_TCM
/*
* We align everything to a page boundary so we can
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4fece72bcd63..9b8fcab7da56 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -204,7 +204,7 @@ config GENERIC_CALIBRATE_DELAY
config ZONE_DMA
def_bool y
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
def_bool y
config ARCH_DMA_ADDR_T_64BIT
diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h
index 0bfe1df12b19..366448eb0fb7 100644
--- a/arch/arm64/include/asm/bug.h
+++ b/arch/arm64/include/asm/bug.h
@@ -59,8 +59,7 @@ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \
unreachable(); \
} while (0)
-#define __WARN_TAINT(taint) \
- __BUG_FLAGS(BUGFLAG_TAINT(taint))
+#define __WARN_FLAGS(flags) __BUG_FLAGS(BUGFLAG_WARNING|(flags))
#define HAVE_ARCH_BUG
diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h
index e7445281e534..8f3043aba873 100644
--- a/arch/arm64/include/asm/efi.h
+++ b/arch/arm64/include/asm/efi.h
@@ -1,6 +1,7 @@
#ifndef _ASM_EFI_H
#define _ASM_EFI_H
+#include <asm/boot.h>
#include <asm/cpufeature.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -46,7 +47,28 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md);
* 2MiB so we know it won't cross a 2MiB boundary.
*/
#define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */
-#define MAX_FDT_OFFSET SZ_512M
+
+/* on arm64, the FDT may be located anywhere in system RAM */
+static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base)
+{
+ return ULONG_MAX;
+}
+
+/*
+ * On arm64, we have to ensure that the initrd ends up in the linear region,
+ * which is a 1 GB aligned region of size '1UL << (VA_BITS - 1)' that is
+ * guaranteed to cover the kernel Image.
+ *
+ * Since the EFI stub is part of the kernel Image, we can relax the
+ * usual requirements in Documentation/arm64/booting.txt, which still
+ * apply to other bootloaders, and are required for some kernel
+ * configurations.
+ */
+static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
+ unsigned long image_addr)
+{
+ return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS - 1));
+}
#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__)
#define __efi_call_early(f, ...) f(__VA_ARGS__)
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 64d9cbd61678..e25c11e727fe 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -18,6 +18,7 @@
#include <linux/acpi.h>
#include <linux/bootmem.h>
#include <linux/cpumask.h>
+#include <linux/efi-bgrt.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
@@ -233,6 +234,8 @@ done:
early_init_dt_scan_chosen_stdout();
} else {
parse_spcr(earlycon_init_is_deferred);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT))
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
}
}
diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S
index 17f2730eb497..623d18db0466 100644
--- a/arch/avr32/kernel/vmlinux.lds.S
+++ b/arch/avr32/kernel/vmlinux.lds.S
@@ -75,6 +75,7 @@ SECTIONS
_edata = .;
}
+ BUG_TABLE
BSS_SECTION(0, 8, 8)
_end = .;
diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S
index 68069a120055..334ef8139b35 100644
--- a/arch/blackfin/kernel/vmlinux.lds.S
+++ b/arch/blackfin/kernel/vmlinux.lds.S
@@ -115,6 +115,8 @@ SECTIONS
__data_lma = LOADADDR(.data);
__data_len = SIZEOF(.data);
+ BUG_TABLE
+
/* The init section should be last, so when we free it, it goes into
* the general memory pool, and (hopefully) will decrease fragmentation
* a tiny bit. The init section has a _requirement_ that it be
diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S
index a1a5c166bc9b..29ebea49ddd5 100644
--- a/arch/c6x/kernel/vmlinux.lds.S
+++ b/arch/c6x/kernel/vmlinux.lds.S
@@ -128,6 +128,8 @@ SECTIONS
. = ALIGN(8);
}
+ BUG_TABLE
+
_edata = .;
__bss_start = .;
diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S
index 979586261520..867f237d7c5c 100644
--- a/arch/cris/kernel/vmlinux.lds.S
+++ b/arch/cris/kernel/vmlinux.lds.S
@@ -68,6 +68,8 @@ SECTIONS
__edata = . ; /* End of data section. */
_edata = . ;
+ BUG_TABLE
+
INIT_TASK_DATA_SECTION(PAGE_SIZE)
. = ALIGN(PAGE_SIZE); /* Init code and data. */
diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S
index aa6e573d57da..3f44dcbbad4d 100644
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -102,6 +102,8 @@ SECTIONS
_edata = .; /* End of data section */
+ BUG_TABLE
+
/* GP section */
. = ALIGN(L1_CACHE_BYTES);
_gp = . + 2048;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index f89d20c97412..798026dde52e 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -192,6 +192,8 @@ SECTIONS {
CONSTRUCTORS
}
+ BUG_TABLE
+
. = ALIGN(16); /* gp must be 16-byte aligned for exc. table */
.got : AT(ADDR(.got) - LOAD_OFFSET) {
*(.got.plt)
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index f0a0e6d62be3..8ca2371aa684 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -97,6 +97,7 @@ SECTIONS
DATA_DATA
CONSTRUCTORS
}
+ BUG_TABLE
_gp = . + 0x8000;
.lit8 : {
*(.lit8)
diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h
index 62a33338549c..d2742273a685 100644
--- a/arch/parisc/include/asm/bug.h
+++ b/arch/parisc/include/asm/bug.h
@@ -46,7 +46,7 @@
#endif
#ifdef CONFIG_DEBUG_BUGVERBOSE
-#define __WARN_TAINT(taint) \
+#define __WARN_FLAGS(flags) \
do { \
asm volatile("\n" \
"1:\t" PARISC_BUG_BREAK_ASM "\n" \
@@ -56,11 +56,11 @@
"\t.org 2b+%c3\n" \
"\t.popsection" \
: : "i" (__FILE__), "i" (__LINE__), \
- "i" (BUGFLAG_TAINT(taint)), \
+ "i" (BUGFLAG_WARNING|(flags)), \
"i" (sizeof(struct bug_entry)) ); \
} while(0)
#else
-#define __WARN_TAINT(taint) \
+#define __WARN_FLAGS(flags) \
do { \
asm volatile("\n" \
"1:\t" PARISC_BUG_BREAK_ASM "\n" \
@@ -69,7 +69,7 @@
"\t.short %c0\n" \
"\t.org 2b+%c1\n" \
"\t.popsection" \
- : : "i" (BUGFLAG_TAINT(taint)), \
+ : : "i" (BUGFLAG_WARNING|(flags)), \
"i" (sizeof(struct bug_entry)) ); \
} while(0)
#endif
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index de29e18d9345..152ebc1b58ec 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,7 +134,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
- select HAVE_GENERIC_RCU_GUP
+ select HAVE_GENERIC_GUP
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE
select HAVE_IOREMAP_PROT
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 3a39283333c3..f2c562a0a427 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -85,12 +85,12 @@
} \
} while (0)
-#define __WARN_TAINT(taint) do { \
+#define __WARN_FLAGS(flags) do { \
__asm__ __volatile__( \
"1: twi 31,0,0\n" \
_EMIT_BUG_ENTRY \
: : "i" (__FILE__), "i" (__LINE__), \
- "i" (BUGFLAG_TAINT(taint)), \
+ "i" (BUGFLAG_WARNING|(flags)), \
"i" (sizeof(struct bug_entry))); \
} while (0)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 78803a7ebdd9..9510ef04b0f5 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -164,11 +164,5 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
- /* by default, allow everything */
- return true;
-}
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 7394b770ae1f..1c24c894c908 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -312,6 +312,8 @@ SECTIONS
NOSAVE_DATA
}
+ BUG_TABLE
+
. = ALIGN(PAGE_SIZE);
_edata = .;
PROVIDE32 (edata = .);
diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h
index bf90d1fd97a5..1bbd9dbfe4e0 100644
--- a/arch/s390/include/asm/bug.h
+++ b/arch/s390/include/asm/bug.h
@@ -46,8 +46,8 @@
unreachable(); \
} while (0)
-#define __WARN_TAINT(taint) do { \
- __EMIT_BUG(BUGFLAG_TAINT(taint)); \
+#define __WARN_FLAGS(flags) do { \
+ __EMIT_BUG(BUGFLAG_WARNING|(flags)); \
} while (0)
#define WARN_ON(x) ({ \
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 6e31d87fb669..fa2bf69be182 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -156,10 +156,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
- /* by default, allow everything */
- return true;
-}
#endif /* __S390_MMU_CONTEXT_H */
diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h
index dcf278075429..1b77f068be2b 100644
--- a/arch/sh/include/asm/bug.h
+++ b/arch/sh/include/asm/bug.h
@@ -50,7 +50,7 @@ do { \
"i" (sizeof(struct bug_entry))); \
} while (0)
-#define __WARN_TAINT(taint) \
+#define __WARN_FLAGS(flags) \
do { \
__asm__ __volatile__ ( \
"1:\t.short %O0\n" \
@@ -59,7 +59,7 @@ do { \
: "n" (TRAPA_BUG_OPCODE), \
"i" (__FILE__), \
"i" (__LINE__), \
- "i" (BUGFLAG_TAINT(taint)), \
+ "i" (BUGFLAG_WARNING|(flags)), \
"i" (sizeof(struct bug_entry))); \
} while (0)
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index fd443852103c..ed9c5b5ff028 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -50,11 +50,6 @@ config GENERIC_CALIBRATE_DELAY
bool
default y
-config GENERIC_BUG
- bool
- default y
- depends on BUG
-
config HZ
int
default 100
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 94ac2739918c..b668e351fd6c 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -37,12 +37,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return true;
}
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
- /* by default, allow everything */
- return true;
-}
-
/*
* end asm-generic/mm_hooks.h functions
*/
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index de5d572225f3..cd1fa97776c3 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -302,8 +302,8 @@ extern int ignore_sigio_fd(int fd);
extern void maybe_sigio_broken(int fd, int read);
extern void sigio_broken(int fd, int read);
-/* sys-x86_64/prctl.c */
-extern int os_arch_prctl(int pid, int code, unsigned long *addr);
+/* prctl.c */
+extern int os_arch_prctl(int pid, int option, unsigned long *arg2);
/* tty.c */
extern int get_pty(void);
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index 62dfc644c908..59b06b48f27d 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -103,10 +103,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
- /* by default, allow everything */
- return true;
-}
#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1d50fdff77ee..df616ee3f4c3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -105,6 +105,7 @@ config X86
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
+ select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
@@ -126,7 +127,7 @@ config X86
select HAVE_EBPF_JIT if X86_64
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EXIT_THREAD
- select HAVE_FENTRY if X86_64
+ select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
@@ -289,6 +290,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
+ default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000
config HAVE_INTEL_TXT
@@ -1042,6 +1044,14 @@ config X86_MCE
The action the kernel takes depends on the severity of the problem,
ranging from warning messages to halting the machine.
+config X86_MCELOG_LEGACY
+ bool "Support for deprecated /dev/mcelog character device"
+ depends on X86_MCE
+ ---help---
+ Enable support for /dev/mcelog which is needed by the old mcelog
+ userspace logging daemon. Consider switching to the new generation
+ rasdaemon solution.
+
config X86_MCE_INTEL
def_bool y
prompt "Intel MCE features"
@@ -1071,7 +1081,7 @@ config X86_MCE_THRESHOLD
def_bool y
config X86_MCE_INJECT
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && X86_MCELOG_LEGACY
tristate "Machine check injector support"
---help---
Provide support for injecting machine checks for testing purposes.
@@ -2786,6 +2796,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11
+config HAVE_GENERIC_GUP
+ def_bool y
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 63c1d13aaf9f..fcb7604172ce 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,6 +5,9 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
+config EARLY_PRINTK_USB
+ bool
+
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
@@ -23,19 +26,20 @@ config EARLY_PRINTK
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized. For normal operation
it is not recommended because it looks ugly and doesn't cooperate
- with klogd/syslogd or the X server. You should normally N here,
+ with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash.
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
---help---
Write kernel log output directly into the EHCI debug port.
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized. For normal operation
it is not recommended because it looks ugly and doesn't cooperate
- with klogd/syslogd or the X server. You should normally N here,
+ with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash. You need usb debug device.
config EARLY_PRINTK_EFI
@@ -48,6 +52,25 @@ config EARLY_PRINTK_EFI
This is useful for kernel debugging when your machine crashes very
early before the console code is initialized.
+config EARLY_PRINTK_USB_XDBC
+ bool "Early printk via the xHCI debug port"
+ depends on EARLY_PRINTK && PCI
+ select EARLY_PRINTK_USB
+ ---help---
+ Write kernel log output directly into the xHCI debug port.
+
+ One use for this feature is kernel debugging, for example when your
+ machine crashes very early before the regular console code is
+ initialized. Other uses include simpler, lockless logging instead of
+ a full-blown printk console driver + klogd.
+
+ For normal production environments this is normally not recommended,
+ because it doesn't feed events into klogd/syslogd and doesn't try to
+ print anything on the screen.
+
+ You should normally say N here, unless you want to debug early
+ crashes or need a very simple printk logging facility.
+
config X86_PTDUMP_CORE
def_bool n
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4ad7d70e8739..8f0c4c9fc904 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
0, /* REQUIRED_MASK5 not implemented in this file */
REQUIRED_MASK6,
0, /* REQUIRED_MASK7 not implemented in this file */
+ 0, /* REQUIRED_MASK8 not implemented in this file */
+ 0, /* REQUIRED_MASK9 not implemented in this file */
+ 0, /* REQUIRED_MASK10 not implemented in this file */
+ 0, /* REQUIRED_MASK11 not implemented in this file */
+ 0, /* REQUIRED_MASK12 not implemented in this file */
+ 0, /* REQUIRED_MASK13 not implemented in this file */
+ 0, /* REQUIRED_MASK14 not implemented in this file */
+ 0, /* REQUIRED_MASK15 not implemented in this file */
+ REQUIRED_MASK16,
};
#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 6687ab953257..9e77c23c2422 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
# define EBX_REG "=b"
#endif
-static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+static inline void cpuid_count(u32 id, u32 count,
+ u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
: "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
- : "a" (id)
+ : "a" (id), "c" (count)
);
}
+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
void get_cpuflags(void)
{
u32 max_intel_level, max_amd_level;
@@ -108,6 +111,11 @@ void get_cpuflags(void)
cpu.model += ((tfms >> 16) & 0xf) << 4;
}
+ if (max_intel_level >= 0x00000007) {
+ cpuid_count(0x00000007, 0, &ignored, &ignored,
+ &cpu.flags[16], &ignored);
+ }
+
cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
&ignored);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 57f7ec35216e..50bc26949e9e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -35,16 +35,13 @@
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
-#include <asm/page_types.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
-#include <asm/export.h>
#include <asm/frame.h>
.section .entry.text, "ax"
@@ -585,7 +582,7 @@ ENTRY(iret_exc )
* will soon execute iret and the tracer was already set to
* the irqstate after the IRET:
*/
- DISABLE_INTERRUPTS(CLBR_EAX)
+ DISABLE_INTERRUPTS(CLBR_ANY)
lss (%esp), %esp /* switch to espfix segment */
jmp .Lrestore_nocheck
#endif
@@ -886,172 +883,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
#endif /* CONFIG_HYPERV */
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
- ret
-END(mcount)
-
-ENTRY(ftrace_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- pushl $0 /* Pass NULL as regs pointer */
- movl 4*4(%esp), %eax
- movl 0x4(%ebp), %edx
- movl function_trace_op, %ecx
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
- call ftrace_stub
-
- addl $4, %esp /* skip NULL pointer */
- popl %edx
- popl %ecx
- popl %eax
-.Lftrace_ret:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- jmp ftrace_stub
-#endif
-
-/* This is weak to keep gas from relaxing the jumps */
-WEAK(ftrace_stub)
- ret
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- pushf /* push flags before compare (in cs location) */
-
- /*
- * i386 does not save SS and ESP when coming from kernel.
- * Instead, to get sp, &regs->sp is used (see ptrace.h).
- * Unfortunately, that means eflags must be at the same location
- * as the current return ip is. We move the return ip into the
- * ip location, and move flags into the return ip location.
- */
- pushl 4(%esp) /* save return ip into ip slot */
-
- pushl $0 /* Load 0 into orig_ax */
- pushl %gs
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
-
- movl 13*4(%esp), %eax /* Get the saved flags */
- movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
- /* clobbering return ip */
- movl $__KERNEL_CS, 13*4(%esp)
-
- movl 12*4(%esp), %eax /* Load ip (1st parameter) */
- subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
- movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
- pushl %esp /* Save pt_regs as 4th parameter */
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- addl $4, %esp /* Skip pt_regs */
- movl 14*4(%esp), %eax /* Move flags back into cs */
- movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
- movl 12*4(%esp), %eax /* Get return ip from regs->ip */
- movl %eax, 14*4(%esp) /* Put return ip back for ret */
-
- popl %ebx
- popl %ecx
- popl %edx
- popl %esi
- popl %edi
- popl %ebp
- popl %eax
- popl %ds
- popl %es
- popl %fs
- popl %gs
- addl $8, %esp /* Skip orig_ax and ip */
- popf /* Pop flags at end (no addl to corrupt flags) */
- jmp .Lftrace_ret
-
- popf
- jmp ftrace_stub
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
- cmpl $__PAGE_OFFSET, %esp
- jb ftrace_stub /* Paging not enabled yet? */
-
- cmpl $ftrace_stub, ftrace_trace_function
- jnz .Ltrace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
- ret
-
- /* taken from glibc */
-.Ltrace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-EXPORT_SYMBOL(mcount)
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- lea 0x4(%ebp), %edx
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %eax
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
- ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
- pushl %eax
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, %ecx
- popl %edx
- popl %eax
- jmp *%ecx
-#endif
-
#ifdef CONFIG_TRACING
ENTRY(trace_page_fault)
ASM_CLAC
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18ebc43c..607d72c4a485 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -212,7 +212,7 @@ entry_SYSCALL_64_fastpath:
* If we see that no exit work is required (which we are required
* to check with IRQs off), then we can go straight to SYSRET64.
*/
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movq PER_CPU_VAR(current_task), %r11
testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
@@ -233,7 +233,7 @@ entry_SYSCALL_64_fastpath:
* raise(3) will trigger this, for example. IRQs are off.
*/
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
+ ENABLE_INTERRUPTS(CLBR_ANY)
SAVE_EXTRA_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
*
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
+ *
+ * Change top 16 bits to be the sign-extension of 47th bit
*/
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- SYSRET checks need update"
- .endif
-
- /* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
@@ -343,7 +340,7 @@ ENTRY(stub_ptregs_64)
* Called from fast path -- disable IRQs again, pop return address
* and jump to slow path
*/
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
popq %rax
jmp entry_SYSCALL64_slow_path
@@ -518,7 +515,7 @@ common_interrupt:
interrupt do_IRQ
/* 0(%rsp): old RSP */
ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
@@ -1051,7 +1048,7 @@ END(paranoid_entry)
* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
ENTRY(paranoid_exit)
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
@@ -1156,10 +1153,9 @@ END(error_entry)
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
ENTRY(error_exit)
- movl %ebx, %eax
- DISABLE_INTERRUPTS(CLBR_NONE)
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- testl %eax, %eax
+ testl %ebx, %ebx
jnz retint_kernel
jmp retint_user
END(error_exit)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9ba050fe47f3..0af59fa789ea 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -390,3 +390,4 @@
381 i386 pkey_alloc sys_pkey_alloc
382 i386 pkey_free sys_pkey_free
383 i386 statx sys_statx
+384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9d4d6e138311..fa8dbfcf7ed3 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -17,6 +17,7 @@
#include <asm/unistd.h>
#include <asm/msr.h>
#include <asm/pvclock.h>
+#include <asm/mshyperv.h>
#include <linux/math64.h>
#include <linux/time.h>
#include <linux/kernel.h>
@@ -32,6 +33,11 @@ extern u8 pvclock_page
__attribute__((visibility("hidden")));
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+extern u8 hvclock_page
+ __attribute__((visibility("hidden")));
+#endif
+
#ifndef BUILD_VDSO32
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
@@ -141,6 +147,20 @@ static notrace u64 vread_pvclock(int *mode)
return last;
}
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+static notrace u64 vread_hvclock(int *mode)
+{
+ const struct ms_hyperv_tsc_page *tsc_pg =
+ (const struct ms_hyperv_tsc_page *)&hvclock_page;
+ u64 current_tick = hv_read_tsc_page(tsc_pg);
+
+ if (current_tick != U64_MAX)
+ return current_tick;
+
+ *mode = VCLOCK_NONE;
+ return 0;
+}
+#endif
notrace static u64 vread_tsc(void)
{
@@ -173,6 +193,10 @@ notrace static inline u64 vgetsns(int *mode)
else if (gtod->vclock_mode == VCLOCK_PVCLOCK)
cycles = vread_pvclock(mode);
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+ else if (gtod->vclock_mode == VCLOCK_HVCLOCK)
+ cycles = vread_hvclock(mode);
+#endif
else
return 0;
v = (cycles - gtod->cycle_last) & gtod->mask;
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index a708aa90b507..8ebb4b6454fe 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
- vvar_start = . - 2 * PAGE_SIZE;
+ vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
pvclock_page = vvar_start + PAGE_SIZE;
+ hvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 491020b2826d..0780a443a53b 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -74,6 +74,7 @@ enum {
sym_vvar_page,
sym_hpet_page,
sym_pvclock_page,
+ sym_hvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -82,6 +83,7 @@ const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
sym_pvclock_page,
+ sym_hvclock_page,
};
struct vdso_sym {
@@ -94,6 +96,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
+ [sym_hvclock_page] = {"hvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 226ca70dc6bd..139ad7726e10 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -22,6 +22,7 @@
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
+#include <asm/mshyperv.h>
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
@@ -121,6 +122,12 @@ static int vvar_fault(const struct vm_special_mapping *sm,
vmf->address,
__pa(pvti) >> PAGE_SHIFT);
}
+ } else if (sym_offset == image->sym_hvclock_page) {
+ struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
+
+ if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
+ ret = vm_insert_pfn(vma, vmf->address,
+ vmalloc_to_pfn(tsc_pg));
}
if (ret == 0 || ret == -EBUSY)
@@ -354,7 +361,7 @@ static void vgetcpu_cpu_init(void *arg)
d.p = 1; /* Present */
d.d = 1; /* 32-bit */
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static int vgetcpu_online(unsigned int cpu)
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index b28200dea715..3641e24fdac5 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -11,6 +11,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) "perf/amd_iommu: " fmt
+
#include <linux/perf_event.h>
#include <linux/init.h>
#include <linux/cpumask.h>
@@ -21,44 +23,42 @@
#define COUNTER_SHIFT 16
-#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg))
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x) ((x)->conf & 0xFFULL)
+#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL)
+#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL)
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL)
-#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL)
+#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL)
-static struct perf_amd_iommu __perf_iommu;
+#define IOMMU_NAME_SIZE 16
struct perf_amd_iommu {
+ struct list_head list;
struct pmu pmu;
+ struct amd_iommu *iommu;
+ char name[IOMMU_NAME_SIZE];
u8 max_banks;
u8 max_counters;
u64 cntr_assign_mask;
raw_spinlock_t lock;
- const struct attribute_group *attr_groups[4];
};
-#define format_group attr_groups[0]
-#define cpumask_group attr_groups[1]
-#define events_group attr_groups[2]
-#define null_group attr_groups[3]
+static LIST_HEAD(perf_amd_iommu_list);
/*---------------------------------------------
* sysfs format attributes
*---------------------------------------------*/
PMU_FORMAT_ATTR(csource, "config:0-7");
PMU_FORMAT_ATTR(devid, "config:8-23");
-PMU_FORMAT_ATTR(pasid, "config:24-39");
-PMU_FORMAT_ATTR(domid, "config:40-55");
+PMU_FORMAT_ATTR(domid, "config:24-39");
+PMU_FORMAT_ATTR(pasid, "config:40-59");
PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
static struct attribute *iommu_format_attrs[] = {
&format_attr_csource.attr,
@@ -79,6 +79,10 @@ static struct attribute_group amd_iommu_format_group = {
/*---------------------------------------------
* sysfs events attributes
*---------------------------------------------*/
+static struct attribute_group amd_iommu_events_group = {
+ .name = "events",
+};
+
struct amd_iommu_event_desc {
struct kobj_attribute attr;
const char *event;
@@ -150,30 +154,34 @@ static struct attribute_group amd_iommu_cpumask_group = {
/*---------------------------------------------*/
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
{
+ struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+ int max_cntrs = piommu->max_counters;
+ int max_banks = piommu->max_banks;
+ u32 shift, bank, cntr;
unsigned long flags;
- int shift, bank, cntr, retval;
- int max_banks = perf_iommu->max_banks;
- int max_cntrs = perf_iommu->max_counters;
+ int retval;
- raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+ raw_spin_lock_irqsave(&piommu->lock, flags);
for (bank = 0, shift = 0; bank < max_banks; bank++) {
for (cntr = 0; cntr < max_cntrs; cntr++) {
shift = bank + (bank*3) + cntr;
- if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) {
+ if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
continue;
} else {
- perf_iommu->cntr_assign_mask |= (1ULL<<shift);
- retval = ((u16)((u16)bank<<8) | (u8)(cntr));
+ piommu->cntr_assign_mask |= BIT_ULL(shift);
+ event->hw.iommu_bank = bank;
+ event->hw.iommu_cntr = cntr;
+ retval = 0;
goto out;
}
}
}
retval = -ENOSPC;
out:
- raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+ raw_spin_unlock_irqrestore(&piommu->lock, flags);
return retval;
}
@@ -202,8 +210,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
static int perf_iommu_event_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- struct perf_amd_iommu *perf_iommu;
- u64 config, config1;
/* test the event attr type check for PMU enumeration */
if (event->attr.type != event->pmu->type)
@@ -225,80 +231,62 @@ static int perf_iommu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
- perf_iommu = &__perf_iommu;
-
- if (event->pmu != &perf_iommu->pmu)
- return -ENOENT;
-
- if (perf_iommu) {
- config = event->attr.config;
- config1 = event->attr.config1;
- } else {
- return -EINVAL;
- }
-
- /* integrate with iommu base devid (0000), assume one iommu */
- perf_iommu->max_banks =
- amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
- perf_iommu->max_counters =
- amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
- if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
- return -EINVAL;
-
/* update the hw_perf_event struct with the iommu config data */
- hwc->config = config;
- hwc->extra_reg.config = config1;
+ hwc->conf = event->attr.config;
+ hwc->conf1 = event->attr.config1;
return 0;
}
+static inline struct amd_iommu *perf_event_2_iommu(struct perf_event *ev)
+{
+ return (container_of(ev->pmu, struct perf_amd_iommu, pmu))->iommu;
+}
+
static void perf_iommu_enable_event(struct perf_event *ev)
{
- u8 csource = _GET_CSOURCE(ev);
- u16 devid = _GET_DEVID(ev);
+ struct amd_iommu *iommu = perf_event_2_iommu(ev);
+ struct hw_perf_event *hwc = &ev->hw;
+ u8 bank = hwc->iommu_bank;
+ u8 cntr = hwc->iommu_cntr;
u64 reg = 0ULL;
- reg = csource;
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+ reg = GET_CSOURCE(hwc);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
- reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+ reg = GET_DEVID_MASK(hwc);
+ reg = GET_DEVID(hwc) | (reg << 32);
if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
- reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+ reg = GET_PASID_MASK(hwc);
+ reg = GET_PASID(hwc) | (reg << 32);
if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_PASID_MATCH_REG, &reg, true);
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
- reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+ reg = GET_DOMID_MASK(hwc);
+ reg = GET_DOMID(hwc) | (reg << 32);
if (reg)
- reg |= (1UL << 31);
- amd_iommu_pc_get_set_reg_val(devid,
- _GET_BANK(ev), _GET_CNTR(ev) ,
- IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+ reg |= BIT(31);
+ amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
}
static void perf_iommu_disable_event(struct perf_event *event)
{
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+ struct hw_perf_event *hwc = &event->hw;
u64 reg = 0ULL;
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_SRC_REG, &reg);
}
static void perf_iommu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
- pr_debug("perf: amd_iommu:perf_iommu_start\n");
if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
return;
@@ -306,10 +294,11 @@ static void perf_iommu_start(struct perf_event *event, int flags)
hwc->state = 0;
if (flags & PERF_EF_RELOAD) {
- u64 prev_raw_count = local64_read(&hwc->prev_count);
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+ u64 prev_raw_count = local64_read(&hwc->prev_count);
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
+
+ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &prev_raw_count);
}
perf_iommu_enable_event(event);
@@ -319,37 +308,30 @@ static void perf_iommu_start(struct perf_event *event, int flags)
static void perf_iommu_read(struct perf_event *event)
{
- u64 count = 0ULL;
- u64 prev_raw_count = 0ULL;
- u64 delta = 0ULL;
+ u64 count, prev, delta;
struct hw_perf_event *hwc = &event->hw;
- pr_debug("perf: amd_iommu:perf_iommu_read\n");
+ struct amd_iommu *iommu = perf_event_2_iommu(event);
- amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
- _GET_BANK(event), _GET_CNTR(event),
- IOMMU_PC_COUNTER_REG, &count, false);
+ if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
+ IOMMU_PC_COUNTER_REG, &count))
+ return;
/* IOMMU pc counter register is only 48 bits */
- count &= 0xFFFFFFFFFFFFULL;
+ count &= GENMASK_ULL(47, 0);
- prev_raw_count = local64_read(&hwc->prev_count);
- if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
- count) != prev_raw_count)
+ prev = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev)
return;
- /* Handling 48-bit counter overflowing */
- delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+ /* Handle 48-bit counter overflow */
+ delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT);
delta >>= COUNTER_SHIFT;
local64_add(delta, &event->count);
-
}
static void perf_iommu_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
- u64 config;
-
- pr_debug("perf: amd_iommu:perf_iommu_stop\n");
if (hwc->state & PERF_HES_UPTODATE)
return;
@@ -361,7 +343,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
if (hwc->state & PERF_HES_UPTODATE)
return;
- config = hwc->config;
perf_iommu_read(event);
hwc->state |= PERF_HES_UPTODATE;
}
@@ -369,17 +350,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
static int perf_iommu_add(struct perf_event *event, int flags)
{
int retval;
- struct perf_amd_iommu *perf_iommu =
- container_of(event->pmu, struct perf_amd_iommu, pmu);
- pr_debug("perf: amd_iommu:perf_iommu_add\n");
event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
/* request an iommu bank/counter */
- retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
- if (retval != -ENOSPC)
- event->hw.extra_reg.reg = (u16)retval;
- else
+ retval = get_next_avail_iommu_bnk_cntr(event);
+ if (retval)
return retval;
if (flags & PERF_EF_START)
@@ -390,115 +366,124 @@ static int perf_iommu_add(struct perf_event *event, int flags)
static void perf_iommu_del(struct perf_event *event, int flags)
{
+ struct hw_perf_event *hwc = &event->hw;
struct perf_amd_iommu *perf_iommu =
container_of(event->pmu, struct perf_amd_iommu, pmu);
- pr_debug("perf: amd_iommu:perf_iommu_del\n");
perf_iommu_stop(event, PERF_EF_UPDATE);
/* clear the assigned iommu bank/counter */
clear_avail_iommu_bnk_cntr(perf_iommu,
- _GET_BANK(event),
- _GET_CNTR(event));
+ hwc->iommu_bank, hwc->iommu_cntr);
perf_event_update_userpage(event);
}
-static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+static __init int _init_events_attrs(void)
{
- struct attribute **attrs;
- struct attribute_group *attr_group;
int i = 0, j;
+ struct attribute **attrs;
while (amd_iommu_v2_event_descs[i].attr.attr.name)
i++;
- attr_group = kzalloc(sizeof(struct attribute *)
- * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
- if (!attr_group)
+ attrs = kzalloc(sizeof(struct attribute **) * (i + 1), GFP_KERNEL);
+ if (!attrs)
return -ENOMEM;
- attrs = (struct attribute **)(attr_group + 1);
for (j = 0; j < i; j++)
attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
- attr_group->name = "events";
- attr_group->attrs = attrs;
- perf_iommu->events_group = attr_group;
-
+ amd_iommu_events_group.attrs = attrs;
return 0;
}
-static __init void amd_iommu_pc_exit(void)
-{
- if (__perf_iommu.events_group != NULL) {
- kfree(__perf_iommu.events_group);
- __perf_iommu.events_group = NULL;
- }
-}
+const struct attribute_group *amd_iommu_attr_groups[] = {
+ &amd_iommu_format_group,
+ &amd_iommu_cpumask_group,
+ &amd_iommu_events_group,
+ NULL,
+};
+
+static struct pmu iommu_pmu = {
+ .event_init = perf_iommu_event_init,
+ .add = perf_iommu_add,
+ .del = perf_iommu_del,
+ .start = perf_iommu_start,
+ .stop = perf_iommu_stop,
+ .read = perf_iommu_read,
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = amd_iommu_attr_groups,
+};
-static __init int _init_perf_amd_iommu(
- struct perf_amd_iommu *perf_iommu, char *name)
+static __init int init_one_iommu(unsigned int idx)
{
+ struct perf_amd_iommu *perf_iommu;
int ret;
+ perf_iommu = kzalloc(sizeof(struct perf_amd_iommu), GFP_KERNEL);
+ if (!perf_iommu)
+ return -ENOMEM;
+
raw_spin_lock_init(&perf_iommu->lock);
- /* Init format attributes */
- perf_iommu->format_group = &amd_iommu_format_group;
+ perf_iommu->pmu = iommu_pmu;
+ perf_iommu->iommu = get_amd_iommu(idx);
+ perf_iommu->max_banks = amd_iommu_pc_get_max_banks(idx);
+ perf_iommu->max_counters = amd_iommu_pc_get_max_counters(idx);
- /* Init cpumask attributes to only core 0 */
- cpumask_set_cpu(0, &iommu_cpumask);
- perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
-
- /* Init events attributes */
- if (_init_events_attrs(perf_iommu) != 0)
- pr_err("perf: amd_iommu: Only support raw events.\n");
+ if (!perf_iommu->iommu ||
+ !perf_iommu->max_banks ||
+ !perf_iommu->max_counters) {
+ kfree(perf_iommu);
+ return -EINVAL;
+ }
- /* Init null attributes */
- perf_iommu->null_group = NULL;
- perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+ snprintf(perf_iommu->name, IOMMU_NAME_SIZE, "amd_iommu_%u", idx);
- ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
- if (ret) {
- pr_err("perf: amd_iommu: Failed to initialized.\n");
- amd_iommu_pc_exit();
+ ret = perf_pmu_register(&perf_iommu->pmu, perf_iommu->name, -1);
+ if (!ret) {
+ pr_info("Detected AMD IOMMU #%d (%d banks, %d counters/bank).\n",
+ idx, perf_iommu->max_banks, perf_iommu->max_counters);
+ list_add_tail(&perf_iommu->list, &perf_amd_iommu_list);
} else {
- pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
- amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
- amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+ pr_warn("Error initializing IOMMU %d.\n", idx);
+ kfree(perf_iommu);
}
-
return ret;
}
-static struct perf_amd_iommu __perf_iommu = {
- .pmu = {
- .task_ctx_nr = perf_invalid_context,
- .event_init = perf_iommu_event_init,
- .add = perf_iommu_add,
- .del = perf_iommu_del,
- .start = perf_iommu_start,
- .stop = perf_iommu_stop,
- .read = perf_iommu_read,
- },
- .max_banks = 0x00,
- .max_counters = 0x00,
- .cntr_assign_mask = 0ULL,
- .format_group = NULL,
- .cpumask_group = NULL,
- .events_group = NULL,
- .null_group = NULL,
-};
-
static __init int amd_iommu_pc_init(void)
{
+ unsigned int i, cnt = 0;
+ int ret;
+
/* Make sure the IOMMU PC resource is available */
if (!amd_iommu_pc_supported())
return -ENODEV;
- _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+ ret = _init_events_attrs();
+ if (ret)
+ return ret;
+
+ /*
+ * An IOMMU PMU is specific to an IOMMU, and can function independently.
+ * So we go through all IOMMUs and ignore the one that fails init
+ * unless all IOMMU are failing.
+ */
+ for (i = 0; i < amd_iommu_get_num_iommus(); i++) {
+ ret = init_one_iommu(i);
+ if (!ret)
+ cnt++;
+ }
+
+ if (!cnt) {
+ kfree(amd_iommu_events_group.attrs);
+ return -ENODEV;
+ }
+ /* Init cpumask attributes to only core 0 */
+ cpumask_set_cpu(0, &iommu_cpumask);
return 0;
}
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
index 845d173278e3..62e0702c4374 100644
--- a/arch/x86/events/amd/iommu.h
+++ b/arch/x86/events/amd/iommu.h
@@ -24,17 +24,23 @@
#define PC_MAX_SPEC_BNKS 64
#define PC_MAX_SPEC_CNTRS 16
-/* iommu pc reg masks*/
-#define IOMMU_BASE_DEVID 0x0000
+struct amd_iommu;
/* amd_iommu_init.c external support functions */
+extern int amd_iommu_get_num_iommus(void);
+
extern bool amd_iommu_pc_supported(void);
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+extern u8 amd_iommu_pc_get_max_banks(unsigned int idx);
+
+extern u8 amd_iommu_pc_get_max_counters(unsigned int idx);
+
+extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value);
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
- u8 fxn, u64 *value, bool is_write);
+extern struct amd_iommu *get_amd_iommu(int idx);
#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 982c9e31daca..8ae8c5ce3a1f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -63,7 +63,6 @@ struct bts_buffer {
unsigned int cur_buf;
bool snapshot;
local_t data_size;
- local_t lost;
local_t head;
unsigned long end;
void **data_pages;
@@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts)
return;
if (ds->bts_index >= ds->bts_absolute_maximum)
- local_inc(&buf->lost);
+ perf_aux_output_flag(&bts->handle,
+ PERF_AUX_FLAG_TRUNCATED);
/*
* old and head are always in the same physical buffer, so we
@@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags)
return;
fail_end_stop:
- perf_aux_output_end(&bts->handle, 0, false);
+ perf_aux_output_end(&bts->handle, 0);
fail_stop:
event->hw.state = PERF_HES_STOPPED;
@@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags)
bts->handle.head =
local_xchg(&buf->data_size,
buf->nr_pages << PAGE_SHIFT);
-
- perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
- !!local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&bts->handle,
+ local_xchg(&buf->data_size, 0));
}
cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
@@ -484,8 +483,7 @@ int intel_bts_interrupt(void)
if (old_head == local_read(&buf->head))
return handled;
- perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
- !!local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));
buf = perf_aux_output_begin(&bts->handle, event);
if (buf)
@@ -500,7 +498,7 @@ int intel_bts_interrupt(void)
* cleared handle::event
*/
barrier();
- perf_aux_output_end(&bts->handle, 0, false);
+ perf_aux_output_end(&bts->handle, 0);
}
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index eb1484c86bb4..4244bed77824 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1553,6 +1553,27 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
+EVENT_ATTR_STR(topdown-total-slots, td_total_slots_glm, "event=0x3c");
+EVENT_ATTR_STR(topdown-total-slots.scale, td_total_slots_scale_glm, "3");
+/* UOPS_NOT_DELIVERED.ANY */
+EVENT_ATTR_STR(topdown-fetch-bubbles, td_fetch_bubbles_glm, "event=0x9c");
+/* ISSUE_SLOTS_NOT_CONSUMED.RECOVERY */
+EVENT_ATTR_STR(topdown-recovery-bubbles, td_recovery_bubbles_glm, "event=0xca,umask=0x02");
+/* UOPS_RETIRED.ANY */
+EVENT_ATTR_STR(topdown-slots-retired, td_slots_retired_glm, "event=0xc2");
+/* UOPS_ISSUED.ANY */
+EVENT_ATTR_STR(topdown-slots-issued, td_slots_issued_glm, "event=0x0e");
+
+static struct attribute *glm_events_attrs[] = {
+ EVENT_PTR(td_total_slots_glm),
+ EVENT_PTR(td_total_slots_scale_glm),
+ EVENT_PTR(td_fetch_bubbles_glm),
+ EVENT_PTR(td_recovery_bubbles_glm),
+ EVENT_PTR(td_slots_issued_glm),
+ EVENT_PTR(td_slots_retired_glm),
+ NULL
+};
+
static struct extra_reg intel_glm_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x760005ffbfull, RSP_0),
@@ -3750,6 +3771,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_prec_dist = true;
x86_pmu.lbr_pt_coexist = true;
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.cpu_events = glm_events_attrs;
pr_cont("Goldmont events, ");
break;
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 5900471ee508..ae8324d65e61 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -28,6 +28,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/intel_pt.h>
+#include <asm/intel-family.h>
#include "../perf_event.h"
#include "pt.h"
@@ -98,6 +99,7 @@ static struct attribute_group pt_cap_group = {
.name = "caps",
};
+PMU_FORMAT_ATTR(pt, "config:0" );
PMU_FORMAT_ATTR(cyc, "config:1" );
PMU_FORMAT_ATTR(pwr_evt, "config:4" );
PMU_FORMAT_ATTR(fup_on_ptw, "config:5" );
@@ -105,11 +107,13 @@ PMU_FORMAT_ATTR(mtc, "config:9" );
PMU_FORMAT_ATTR(tsc, "config:10" );
PMU_FORMAT_ATTR(noretcomp, "config:11" );
PMU_FORMAT_ATTR(ptw, "config:12" );
+PMU_FORMAT_ATTR(branch, "config:13" );
PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
PMU_FORMAT_ATTR(psb_period, "config:24-27" );
static struct attribute *pt_formats_attr[] = {
+ &format_attr_pt.attr,
&format_attr_cyc.attr,
&format_attr_pwr_evt.attr,
&format_attr_fup_on_ptw.attr,
@@ -117,6 +121,7 @@ static struct attribute *pt_formats_attr[] = {
&format_attr_tsc.attr,
&format_attr_noretcomp.attr,
&format_attr_ptw.attr,
+ &format_attr_branch.attr,
&format_attr_mtc_period.attr,
&format_attr_cyc_thresh.attr,
&format_attr_psb_period.attr,
@@ -197,6 +202,19 @@ static int __init pt_pmu_hw_init(void)
pt_pmu.tsc_art_den = eax;
}
+ /* model-specific quirks */
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_BROADWELL_CORE:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_GT3E:
+ case INTEL_FAM6_BROADWELL_X:
+ /* not setting BRANCH_EN will #GP, erratum BDM106 */
+ pt_pmu.branch_en_always_on = true;
+ break;
+ default:
+ break;
+ }
+
if (boot_cpu_has(X86_FEATURE_VMX)) {
/*
* Intel SDM, 36.5 "Tracing post-VMXON" says that
@@ -263,8 +281,20 @@ fail:
#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
RTIT_CTL_FUP_ON_PTW)
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
+/*
+ * Bit 0 (TraceEn) in the attr.config is meaningless as the
+ * corresponding bit in the RTIT_CTL can only be controlled
+ * by the driver; therefore, repurpose it to mean: pass
+ * through the bit that was previously assumed to be always
+ * on for PT, thereby allowing the user to *not* set it if
+ * they so wish. See also pt_event_valid() and pt_config().
+ */
+#define RTIT_CTL_PASSTHROUGH RTIT_CTL_TRACEEN
+
+#define PT_CONFIG_MASK (RTIT_CTL_TRACEEN | \
+ RTIT_CTL_TSC_EN | \
RTIT_CTL_DISRETC | \
+ RTIT_CTL_BRANCH_EN | \
RTIT_CTL_CYC_PSB | \
RTIT_CTL_MTC | \
RTIT_CTL_PWR_EVT_EN | \
@@ -332,6 +362,33 @@ static bool pt_event_valid(struct perf_event *event)
return false;
}
+ /*
+ * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
+ * clears the assomption that BranchEn must always be enabled,
+ * as was the case with the first implementation of PT.
+ * If this bit is not set, the legacy behavior is preserved
+ * for compatibility with the older userspace.
+ *
+ * Re-using bit 0 for this purpose is fine because it is never
+ * directly set by the user; previous attempts at setting it in
+ * the attr.config resulted in -EINVAL.
+ */
+ if (config & RTIT_CTL_PASSTHROUGH) {
+ /*
+ * Disallow not setting BRANCH_EN where BRANCH_EN is
+ * always required.
+ */
+ if (pt_pmu.branch_en_always_on &&
+ !(config & RTIT_CTL_BRANCH_EN))
+ return false;
+ } else {
+ /*
+ * Disallow BRANCH_EN without the PASSTHROUGH.
+ */
+ if (config & RTIT_CTL_BRANCH_EN)
+ return false;
+ }
+
return true;
}
@@ -411,6 +468,7 @@ static u64 pt_config_filters(struct perf_event *event)
static void pt_config(struct perf_event *event)
{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 reg;
if (!event->hw.itrace_started) {
@@ -419,7 +477,20 @@ static void pt_config(struct perf_event *event)
}
reg = pt_config_filters(event);
- reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+ reg |= RTIT_CTL_TOPA | RTIT_CTL_TRACEEN;
+
+ /*
+ * Previously, we had BRANCH_EN on by default, but now that PT has
+ * grown features outside of branch tracing, it is useful to allow
+ * the user to disable it. Setting bit 0 in the event's attr.config
+ * allows BRANCH_EN to pass through instead of being always on. See
+ * also the comment in pt_event_valid().
+ */
+ if (event->attr.config & BIT(0)) {
+ reg |= event->attr.config & RTIT_CTL_BRANCH_EN;
+ } else {
+ reg |= RTIT_CTL_BRANCH_EN;
+ }
if (!event->attr.exclude_kernel)
reg |= RTIT_CTL_OS;
@@ -429,11 +500,15 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.config = reg;
- wrmsrl(MSR_IA32_RTIT_CTL, reg);
+ if (READ_ONCE(pt->vmx_on))
+ perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL);
+ else
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
}
static void pt_config_stop(struct perf_event *event)
{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 ctl = READ_ONCE(event->hw.config);
/* may be already stopped by a PMI */
@@ -441,7 +516,8 @@ static void pt_config_stop(struct perf_event *event)
return;
ctl &= ~RTIT_CTL_TRACEEN;
- wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+ if (!READ_ONCE(pt->vmx_on))
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
WRITE_ONCE(event->hw.config, ctl);
@@ -753,7 +829,8 @@ static void pt_handle_status(struct pt *pt)
*/
if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
- local_inc(&buf->lost);
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_TRUNCATED);
advance++;
}
}
@@ -846,8 +923,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
/* can't stop in the middle of an output region */
if (buf->output_off + handle->size + 1 <
- sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
return -EINVAL;
+ }
/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
@@ -1171,12 +1250,6 @@ void intel_pt_interrupt(void)
if (!READ_ONCE(pt->handle_nmi))
return;
- /*
- * If VMX is on and PT does not support it, don't touch anything.
- */
- if (READ_ONCE(pt->vmx_on))
- return;
-
if (!event)
return;
@@ -1192,8 +1265,7 @@ void intel_pt_interrupt(void)
pt_update_head(pt);
- perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
- local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
if (!event->hw.state) {
int ret;
@@ -1208,7 +1280,7 @@ void intel_pt_interrupt(void)
/* snapshot counters don't use PMI, so it's safe */
ret = pt_buffer_reset_markers(buf, &pt->handle);
if (ret) {
- perf_aux_output_end(&pt->handle, 0, true);
+ perf_aux_output_end(&pt->handle, 0);
return;
}
@@ -1237,12 +1309,19 @@ void intel_pt_handle_vmx(int on)
local_irq_save(flags);
WRITE_ONCE(pt->vmx_on, on);
- if (on) {
- /* prevent pt_config_stop() from writing RTIT_CTL */
- event = pt->handle.event;
- if (event)
- event->hw.config = 0;
- }
+ /*
+ * If an AUX transaction is in progress, it will contain
+ * gap(s), so flag it PARTIAL to inform the user.
+ */
+ event = pt->handle.event;
+ if (event)
+ perf_aux_output_flag(&pt->handle,
+ PERF_AUX_FLAG_PARTIAL);
+
+ /* Turn PTs back on */
+ if (!on && event)
+ wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
+
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
@@ -1257,9 +1336,6 @@ static void pt_event_start(struct perf_event *event, int mode)
struct pt *pt = this_cpu_ptr(&pt_ctx);
struct pt_buffer *buf;
- if (READ_ONCE(pt->vmx_on))
- return;
-
buf = perf_aux_output_begin(&pt->handle, event);
if (!buf)
goto fail_stop;
@@ -1280,7 +1356,7 @@ static void pt_event_start(struct perf_event *event, int mode)
return;
fail_end_stop:
- perf_aux_output_end(&pt->handle, 0, true);
+ perf_aux_output_end(&pt->handle, 0);
fail_stop:
hwc->state = PERF_HES_STOPPED;
}
@@ -1321,8 +1397,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
pt->handle.head =
local_xchg(&buf->data_size,
buf->nr_pages << PAGE_SHIFT);
- perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
- local_xchg(&buf->lost, 0));
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
}
}
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 53473c21b554..0eb41d07b79a 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -110,6 +110,7 @@ struct pt_pmu {
struct pmu pmu;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
bool vmx;
+ bool branch_en_always_on;
unsigned long max_nonturbo_ratio;
unsigned int tsc_art_num;
unsigned int tsc_art_den;
@@ -143,7 +144,6 @@ struct pt_buffer {
size_t output_off;
unsigned long nr_pages;
local_t data_size;
- local_t lost;
local64_t head;
bool snapshot;
unsigned long stop_pos, intr_pos;
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 8bef70e7f3cc..2b01421f7d0f 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -27,45 +27,22 @@
#include <linux/clockchips.h>
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_HYPERV_TSCPAGE
static struct ms_hyperv_tsc_page *tsc_pg;
+struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
+{
+ return tsc_pg;
+}
+
static u64 read_hv_clock_tsc(struct clocksource *arg)
{
- u64 current_tick;
+ u64 current_tick = hv_read_tsc_page(tsc_pg);
+
+ if (current_tick == U64_MAX)
+ rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
- if (tsc_pg->tsc_sequence != 0) {
- /*
- * Use the tsc page to compute the value.
- */
-
- while (1) {
- u64 tmp;
- u32 sequence = tsc_pg->tsc_sequence;
- u64 cur_tsc;
- u64 scale = tsc_pg->tsc_scale;
- s64 offset = tsc_pg->tsc_offset;
-
- rdtscll(cur_tsc);
- /* current_tick = ((cur_tsc *scale) >> 64) + offset */
- asm("mulq %3"
- : "=d" (current_tick), "=a" (tmp)
- : "a" (cur_tsc), "r" (scale));
-
- current_tick += offset;
- if (tsc_pg->tsc_sequence == sequence)
- return current_tick;
-
- if (tsc_pg->tsc_sequence != 0)
- continue;
- /*
- * Fallback using MSR method.
- */
- break;
- }
- }
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
return current_tick;
}
@@ -139,7 +116,7 @@ void hyperv_init(void)
/*
* Register Hyper-V specific clocksource.
*/
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_HYPERV_TSCPAGE
if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) {
union hv_x64_msr_hypercall_contents tsc_msr;
@@ -155,6 +132,9 @@ void hyperv_init(void)
tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg);
wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+
+ hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK;
+
clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100);
return;
}
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 730ef65e8393..bdffcd9eab2b 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,12 +252,6 @@ static inline int x2apic_enabled(void) { return 0; }
#define x2apic_supported() (0)
#endif /* !CONFIG_X86_X2APIC */
-#ifdef CONFIG_X86_64
-#define SET_APIC_ID(x) (apic->set_apic_id(x))
-#else
-
-#endif
-
/*
* Copyright 2004 James Cleverdon, IBM.
* Subject to the GNU Public License, v.2
@@ -299,6 +293,7 @@ struct apic {
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
unsigned int (*get_apic_id)(unsigned long x);
+ /* Can't be NULL on 64-bit */
unsigned long (*set_apic_id)(unsigned int id);
int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 14635c5ea025..caa5798c92f4 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
return cmpxchg(&v->counter, old, new);
}
+#define atomic_try_cmpxchg atomic_try_cmpxchg
+static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+ return try_cmpxchg(&v->counter, old, new);
+}
+
static inline int atomic_xchg(atomic_t *v, int new)
{
return xchg(&v->counter, new);
@@ -201,16 +207,12 @@ static inline void atomic_##op(int i, atomic_t *v) \
}
#define ATOMIC_FETCH_OP(op, c_op) \
-static inline int atomic_fetch_##op(int i, atomic_t *v) \
+static inline int atomic_fetch_##op(int i, atomic_t *v) \
{ \
- int old, val = atomic_read(v); \
- for (;;) { \
- old = atomic_cmpxchg(v, val, val c_op i); \
- if (old == val) \
- break; \
- val = old; \
- } \
- return old; \
+ int val = atomic_read(v); \
+ do { \
+ } while (!atomic_try_cmpxchg(v, &val, val c_op i)); \
+ return val; \
}
#define ATOMIC_OPS(op, c_op) \
@@ -236,16 +238,11 @@ ATOMIC_OPS(xor, ^)
*/
static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
{
- int c, old;
- c = atomic_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic_cmpxchg((v), c, c + (a));
- if (likely(old == c))
+ int c = atomic_read(v);
+ do {
+ if (unlikely(c == u))
break;
- c = old;
- }
+ } while (!atomic_try_cmpxchg(v, &c, c + a));
return c;
}
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 89ed2f6ae2f7..6189a433c9a9 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
return cmpxchg(&v->counter, old, new);
}
+#define atomic64_try_cmpxchg atomic64_try_cmpxchg
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+{
+ return try_cmpxchg(&v->counter, old, new);
+}
+
static inline long atomic64_xchg(atomic64_t *v, long new)
{
return xchg(&v->counter, new);
@@ -192,17 +198,12 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
*/
static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
{
- long c, old;
- c = atomic64_read(v);
- for (;;) {
- if (unlikely(c == (u)))
- break;
- old = atomic64_cmpxchg((v), c, c + (a));
- if (likely(old == c))
- break;
- c = old;
- }
- return c != (u);
+ long c = atomic64_read(v);
+ do {
+ if (unlikely(c == u))
+ return false;
+ } while (!atomic64_try_cmpxchg(v, &c, c + a));
+ return true;
}
#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
@@ -216,17 +217,12 @@ static inline bool atomic64_add_unless(atomic64_t *v, long a, long u)
*/
static inline long atomic64_dec_if_positive(atomic64_t *v)
{
- long c, old, dec;
- c = atomic64_read(v);
- for (;;) {
+ long dec, c = atomic64_read(v);
+ do {
dec = c - 1;
if (unlikely(dec < 0))
break;
- old = atomic64_cmpxchg((v), c, dec);
- if (likely(old == c))
- break;
- c = old;
- }
+ } while (!atomic64_try_cmpxchg(v, &c, dec));
return dec;
}
@@ -242,14 +238,10 @@ static inline void atomic64_##op(long i, atomic64_t *v) \
#define ATOMIC64_FETCH_OP(op, c_op) \
static inline long atomic64_fetch_##op(long i, atomic64_t *v) \
{ \
- long old, val = atomic64_read(v); \
- for (;;) { \
- old = atomic64_cmpxchg(v, val, val c_op i); \
- if (old == val) \
- break; \
- val = old; \
- } \
- return old; \
+ long val = atomic64_read(v); \
+ do { \
+ } while (!atomic64_try_cmpxchg(v, &val, val c_op i)); \
+ return val; \
}
#define ATOMIC64_OPS(op, c_op) \
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index ba38ebbaced3..39e702d90cdb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -1,36 +1,82 @@
#ifndef _ASM_X86_BUG_H
#define _ASM_X86_BUG_H
-#define HAVE_ARCH_BUG
+#include <linux/stringify.h>
-#ifdef CONFIG_DEBUG_BUGVERBOSE
+/*
+ * Since some emulators terminate on UD2, we cannot use it for WARN.
+ * Since various instruction decoders disagree on the length of UD1,
+ * we cannot use it either. So use UD0 for WARN.
+ *
+ * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas
+ * our kernel decoder thinks it takes a ModRM byte, which seems consistent
+ * with various things like the Intel SDM instruction encoding rules)
+ */
+
+#define ASM_UD0 ".byte 0x0f, 0xff"
+#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */
+#define ASM_UD2 ".byte 0x0f, 0x0b"
+
+#define INSN_UD0 0xff0f
+#define INSN_UD2 0x0b0f
+
+#define LEN_UD0 2
+
+#ifdef CONFIG_GENERIC_BUG
#ifdef CONFIG_X86_32
-# define __BUG_C0 "2:\t.long 1b, %c0\n"
+# define __BUG_REL(val) ".long " __stringify(val)
#else
-# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n"
+# define __BUG_REL(val) ".long " __stringify(val) " - 2b"
#endif
-#define BUG() \
-do { \
- asm volatile("1:\tud2\n" \
- ".pushsection __bug_table,\"a\"\n" \
- __BUG_C0 \
- "\t.word %c1, 0\n" \
- "\t.org 2b+%c2\n" \
- ".popsection" \
- : : "i" (__FILE__), "i" (__LINE__), \
- "i" (sizeof(struct bug_entry))); \
- unreachable(); \
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#define _BUG_FLAGS(ins, flags) \
+do { \
+ asm volatile("1:\t" ins "\n" \
+ ".pushsection __bug_table,\"a\"\n" \
+ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
+ "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \
+ "\t.word %c1" "\t# bug_entry::line\n" \
+ "\t.word %c2" "\t# bug_entry::flags\n" \
+ "\t.org 2b+%c3\n" \
+ ".popsection" \
+ : : "i" (__FILE__), "i" (__LINE__), \
+ "i" (flags), \
+ "i" (sizeof(struct bug_entry))); \
} while (0)
+#else /* !CONFIG_DEBUG_BUGVERBOSE */
+
+#define _BUG_FLAGS(ins, flags) \
+do { \
+ asm volatile("1:\t" ins "\n" \
+ ".pushsection __bug_table,\"a\"\n" \
+ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \
+ "\t.word %c0" "\t# bug_entry::flags\n" \
+ "\t.org 2b+%c1\n" \
+ ".popsection" \
+ : : "i" (flags), \
+ "i" (sizeof(struct bug_entry))); \
+} while (0)
+
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+
#else
+
+#define _BUG_FLAGS(ins, flags) asm volatile(ins)
+
+#endif /* CONFIG_GENERIC_BUG */
+
+#define HAVE_ARCH_BUG
#define BUG() \
do { \
- asm volatile("ud2"); \
+ _BUG_FLAGS(ASM_UD2, 0); \
unreachable(); \
} while (0)
-#endif
+
+#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags))
#include <asm-generic/bug.h>
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index eae33c7170c8..47bea8cadbd0 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -6,7 +6,8 @@
#define VCLOCK_NONE 0 /* No vDSO clock available. */
#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
#define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */
-#define VCLOCK_MAX 2
+#define VCLOCK_HVCLOCK 3 /* vDSO should use vread_hvclock. */
+#define VCLOCK_MAX 3
struct arch_clocksource_data {
int vclock_mode;
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 97848cdfcb1a..d90296d061e8 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -153,6 +153,76 @@ extern void __add_wrong_size(void)
#define cmpxchg_local(ptr, old, new) \
__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \
+({ \
+ bool success; \
+ __typeof__(_ptr) _old = (_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(_ptr); \
+ asm volatile(lock "cmpxchgb %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "q" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(_ptr); \
+ asm volatile(lock "cmpxchgw %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(_ptr); \
+ asm volatile(lock "cmpxchgl %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(_ptr); \
+ asm volatile(lock "cmpxchgq %[new], %[ptr]" \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*__ptr), \
+ [old] "+a" (__old) \
+ : [new] "r" (__new) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); \
+})
+
+#define __try_cmpxchg(ptr, pold, new, size) \
+ __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define try_cmpxchg(ptr, pold, new) \
+ __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
/*
* xadd() adds "inc" to "*ptr" and atomically returns the previous
* value of "*ptr".
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index b04bb6dfed7f..0fe00446f9ca 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -187,6 +187,7 @@
* Reuse free bits when adding new feature flags!
*/
#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
+#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1548ca92ad3f..d0a21b12dd58 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,6 +4,7 @@
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
+#include <asm/fixmap.h>
#include <linux/smp.h>
#include <linux/percpu.h>
@@ -45,11 +46,43 @@ struct gdt_page {
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+/* Provide the original GDT */
+static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
{
return per_cpu(gdt_page, cpu).gdt;
}
+/* Provide the current original GDT */
+static inline struct desc_struct *get_current_gdt_rw(void)
+{
+ return this_cpu_ptr(&gdt_page)->gdt;
+}
+
+/* Get the fixmap index for a specific processor */
+static inline unsigned int get_cpu_gdt_ro_index(int cpu)
+{
+ return FIX_GDT_REMAP_BEGIN + cpu;
+}
+
+/* Provide the fixmap address of the remapped GDT */
+static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
+{
+ unsigned int idx = get_cpu_gdt_ro_index(cpu);
+ return (struct desc_struct *)__fix_to_virt(idx);
+}
+
+/* Provide the current read-only GDT */
+static inline struct desc_struct *get_current_gdt_ro(void)
+{
+ return get_cpu_gdt_ro(smp_processor_id());
+}
+
+/* Provide the physical address of the GDT page. */
+static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
+{
+ return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
+}
+
#ifdef CONFIG_X86_64
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@@ -174,7 +207,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
{
- struct desc_struct *d = get_cpu_gdt_table(cpu);
+ struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;
set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
@@ -194,22 +227,90 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
entries * LDT_ENTRY_SIZE - 1);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
+ write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
&ldt, DESC_LDT);
asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
}
+static inline void native_load_gdt(const struct desc_ptr *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static inline void native_load_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static inline void native_store_gdt(struct desc_ptr *dtr)
+{
+ asm volatile("sgdt %0":"=m" (*dtr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm volatile("sidt %0":"=m" (*dtr));
+}
+
+/*
+ * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
+ * a read-only remapping. To prevent a page fault, the GDT is switched to the
+ * original writeable version when needed.
+ */
+#ifdef CONFIG_X86_64
+static inline void native_load_tr_desc(void)
+{
+ struct desc_ptr gdt;
+ int cpu = raw_smp_processor_id();
+ bool restore = 0;
+ struct desc_struct *fixmap_gdt;
+
+ native_store_gdt(&gdt);
+ fixmap_gdt = get_cpu_gdt_ro(cpu);
+
+ /*
+ * If the current GDT is the read-only fixmap, swap to the original
+ * writeable version. Swap back at the end.
+ */
+ if (gdt.address == (unsigned long)fixmap_gdt) {
+ load_direct_gdt(cpu);
+ restore = 1;
+ }
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+ if (restore)
+ load_fixmap_gdt(cpu);
+}
+#else
static inline void native_load_tr_desc(void)
{
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
+#endif
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+
+ asm volatile("str %0":"=r" (tr));
+
+ return tr;
+}
+
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
+ unsigned int i;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
+}
DECLARE_PER_CPU(bool, __tss_limit_invalid);
static inline void force_reload_TR(void)
{
- struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+ struct desc_struct *d = get_current_gdt_rw();
tss_desc tss;
memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
@@ -257,44 +358,6 @@ static inline void invalidate_tss_limit(void)
this_cpu_write(__tss_limit_invalid, true);
}
-static inline void native_load_gdt(const struct desc_ptr *dtr)
-{
- asm volatile("lgdt %0"::"m" (*dtr));
-}
-
-static inline void native_load_idt(const struct desc_ptr *dtr)
-{
- asm volatile("lidt %0"::"m" (*dtr));
-}
-
-static inline void native_store_gdt(struct desc_ptr *dtr)
-{
- asm volatile("sgdt %0":"=m" (*dtr));
-}
-
-static inline void native_store_idt(struct desc_ptr *dtr)
-{
- asm volatile("sidt %0":"=m" (*dtr));
-}
-
-static inline unsigned long native_store_tr(void)
-{
- unsigned long tr;
-
- asm volatile("str %0":"=r" (tr));
-
- return tr;
-}
-
-static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- unsigned int i;
-
- for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
- gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
-}
-
/* This intentionally ignores lm, since 32-bit apps don't have that field. */
#define LDT_empty(info) \
((info)->base_addr == 0 && \
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 85599ad4d024..5dff775af7cd 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -36,6 +36,12 @@
# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+#ifdef CONFIG_X86_5LEVEL
+# define DISABLE_LA57 0
+#else
+# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -55,7 +61,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
#define DISABLED_MASK17 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9d49c18b5ea9..d4d3ed456cb7 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -293,8 +293,23 @@ do { \
} \
} while (0)
+/*
+ * True on X86_32 or when emulating IA32 on X86_64
+ */
+static inline int mmap_is_ia32(void)
+{
+ return IS_ENABLED(CONFIG_X86_32) ||
+ (IS_ENABLED(CONFIG_COMPAT) &&
+ test_thread_flag(TIF_ADDR32));
+}
+
+extern unsigned long tasksize_32bit(void);
+extern unsigned long tasksize_64bit(void);
+extern unsigned long get_mmap_base(int is_legacy);
+
#ifdef CONFIG_X86_32
+#define __STACK_RND_MASK(is32bit) (0x7ff)
#define STACK_RND_MASK (0x7ff)
#define ARCH_DLINFO ARCH_DLINFO_IA32
@@ -304,7 +319,8 @@ do { \
#else /* CONFIG_X86_32 */
/* 1GB for 64bit, 8MB for 32bit */
-#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
+#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
#define ARCH_DLINFO \
do { \
@@ -348,16 +364,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
-/*
- * True on X86_32 or when emulating IA32 on X86_64
- */
-static inline int mmap_is_ia32(void)
-{
- return IS_ENABLED(CONFIG_X86_32) ||
- (IS_ENABLED(CONFIG_COMPAT) &&
- test_thread_flag(TIF_ADDR32));
-}
-
/* Do not change the values. See get_align_mask() */
enum align_flags {
ALIGN_VA_32 = BIT(0),
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 8554f960e21b..b65155cc3760 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -100,6 +100,10 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
+ /* Fixmap entries to remap the GDTs, one per processor. */
+ FIX_GDT_REMAP_BEGIN,
+ FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+
__end_of_permanent_fixed_addresses,
/*
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 1410b567ecde..f527b02a0ee3 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -11,9 +11,12 @@
* 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
*/
#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
- (0xffff800000000000ULL >> 3))
-/* 47 bits for kernel address -> (47 - 3) bits for shadow */
-#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+ ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+/*
+ * 47 bits for kernel address -> (47 - 3) bits for shadow
+ * 56 bits for kernel address -> (56 - 3) bits for shadow
+ */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 282630e4c6ea..70ef205489f0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -164,6 +164,7 @@ struct kimage_arch {
};
#else
struct kimage_arch {
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index e63873683d4a..4fd5195deed0 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -128,7 +128,7 @@
* debugging tools. Each entry is only valid when its finished flag
* is set.
*/
-struct mce_log {
+struct mce_log_buffer {
char signature[12]; /* "MACHINECHECK" */
unsigned len; /* = MCE_LOG_LEN */
unsigned next;
@@ -191,10 +191,12 @@ extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;
enum mce_notifier_prios {
- MCE_PRIO_SRAO = INT_MAX,
- MCE_PRIO_EXTLOG = INT_MAX - 1,
- MCE_PRIO_NFIT = INT_MAX - 2,
- MCE_PRIO_EDAC = INT_MAX - 3,
+ MCE_PRIO_FIRST = INT_MAX,
+ MCE_PRIO_SRAO = INT_MAX - 1,
+ MCE_PRIO_EXTLOG = INT_MAX - 2,
+ MCE_PRIO_NFIT = INT_MAX - 3,
+ MCE_PRIO_EDAC = INT_MAX - 4,
+ MCE_PRIO_MCELOG = 1,
MCE_PRIO_LOWEST = 0,
};
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 306c7e12af55..6e933d2d88d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -220,18 +220,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif
-static inline bool __pkru_allows_pkey(u16 pkey, bool write)
-{
- u32 pkru = read_pkru();
-
- if (!__pkru_allows_read(pkru, pkey))
- return false;
- if (write && !__pkru_allows_write(pkru, pkey))
- return false;
-
- return true;
-}
-
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
@@ -268,8 +256,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
- return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
-}
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 7c9c895432a9..fba100713924 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -176,4 +176,58 @@ void hyperv_report_panic(struct pt_regs *regs);
bool hv_is_hypercall_page_setup(void);
void hyperv_cleanup(void);
#endif
+#ifdef CONFIG_HYPERV_TSCPAGE
+struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
+static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
+{
+ u64 scale, offset, cur_tsc;
+ u32 sequence;
+
+ /*
+ * The protocol for reading Hyper-V TSC page is specified in Hypervisor
+ * Top-Level Functional Specification ver. 3.0 and above. To get the
+ * reference time we must do the following:
+ * - READ ReferenceTscSequence
+ * A special '0' value indicates the time source is unreliable and we
+ * need to use something else. The currently published specification
+ * versions (up to 4.0b) contain a mistake and wrongly claim '-1'
+ * instead of '0' as the special value, see commit c35b82ef0294.
+ * - ReferenceTime =
+ * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset
+ * - READ ReferenceTscSequence again. In case its value has changed
+ * since our first reading we need to discard ReferenceTime and repeat
+ * the whole sequence as the hypervisor was updating the page in
+ * between.
+ */
+ do {
+ sequence = READ_ONCE(tsc_pg->tsc_sequence);
+ if (!sequence)
+ return U64_MAX;
+ /*
+ * Make sure we read sequence before we read other values from
+ * TSC page.
+ */
+ smp_rmb();
+
+ scale = READ_ONCE(tsc_pg->tsc_scale);
+ offset = READ_ONCE(tsc_pg->tsc_offset);
+ cur_tsc = rdtsc_ordered();
+
+ /*
+ * Make sure we read sequence after we read all other values
+ * from TSC page.
+ */
+ smp_rmb();
+
+ } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
+
+ return mul_u64_u64_shr(cur_tsc, scale, 64) + offset;
+}
+
+#else
+static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
+{
+ return NULL;
+}
+#endif
#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8b5f8ab8ef9..673f9ac50f6d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -45,6 +45,8 @@
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
#define MSR_PLATFORM_INFO 0x000000ce
+#define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31
+#define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)
#define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2
#define NHM_C3_AUTO_DEMOTE (1UL << 25)
@@ -127,6 +129,7 @@
/* DEBUGCTLMSR bits (others vary by model): */
#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
+#define DEBUGCTLMSR_BTF_SHIFT 1
#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
#define DEBUGCTLMSR_TR (1UL << 6)
#define DEBUGCTLMSR_BTS (1UL << 7)
@@ -552,10 +555,12 @@
#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT 39
#define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE_BIT)
-/* MISC_FEATURE_ENABLES non-architectural features */
-#define MSR_MISC_FEATURE_ENABLES 0x00000140
+/* MISC_FEATURES_ENABLES non-architectural features */
+#define MSR_MISC_FEATURES_ENABLES 0x00000140
-#define MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT 1
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
+#define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1
#define MSR_IA32_TSC_DEADLINE 0x000006E0
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b3bebf9e5746..b4a0d43248cf 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -4,6 +4,7 @@
#include <asm/page_64_types.h>
#ifndef __ASSEMBLY__
+#include <asm/alternative.h>
/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
@@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif
-void clear_page(void *page);
+void clear_page_orig(void *page);
+void clear_page_rep(void *page);
+void clear_page_erms(void *page);
+
+static inline void clear_page(void *page)
+{
+ alternative_call_2(clear_page_orig,
+ clear_page_rep, X86_FEATURE_REP_GOOD,
+ clear_page_erms, X86_FEATURE_ERMS,
+ "=D" (page),
+ "0" (page)
+ : "memory", "rax", "rcx");
+}
+
void copy_page(void *to, void *from);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 9215e0527647..3f5f08b010d0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,7 +36,12 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
+#ifdef CONFIG_X86_5LEVEL
+#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
+#else
#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
+#endif
+
#ifdef CONFIG_RANDOMIZE_MEMORY
#define __PAGE_OFFSET page_offset_base
#else
@@ -46,8 +51,13 @@
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
+#ifdef CONFIG_X86_5LEVEL
+#define __PHYSICAL_MASK_SHIFT 52
+#define __VIRTUAL_MASK_SHIFT 56
+#else
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
+#endif
/*
* Kernel image size is limited to 1GiB due to the fixmap living in the
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0489884fdc44..55fa56fe4e45 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+}
+
+static inline void paravirt_release_p4d(unsigned long pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+}
+
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@@ -536,7 +546,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
val);
}
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
@@ -565,26 +575,54 @@ static inline pudval_t pud_val(pud_t pud)
return ret;
}
-static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+static inline void pud_clear(pud_t *pudp)
{
- pgdval_t val = native_pgd_val(pgd);
+ set_pud(pudp, __pud(0));
+}
- if (sizeof(pgdval_t) > sizeof(long))
- PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ p4dval_t val = native_p4d_val(p4d);
+
+ if (sizeof(p4dval_t) > sizeof(long))
+ PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
val, (u64)val >> 32);
else
- PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
+ PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
val);
}
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+static inline p4d_t __p4d(p4dval_t val)
+{
+ p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);
+
+ return (p4d_t) { ret };
+}
+
+static inline p4dval_t p4d_val(p4d_t p4d)
+{
+ return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
+}
+
+static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ pgdval_t val = native_pgd_val(pgd);
+
+ PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
+}
+
static inline void pgd_clear(pgd_t *pgdp)
{
set_pgd(pgdp, __pgd(0));
}
-static inline void pud_clear(pud_t *pudp)
+#endif /* CONFIG_PGTABLE_LEVELS == 5 */
+
+static inline void p4d_clear(p4d_t *p4dp)
{
- set_pud(pudp, __pud(0));
+ set_p4d(p4dp, __p4d(0));
}
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b060f962d581..7465d6fe336f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -238,9 +238,11 @@ struct pv_mmu_ops {
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+ void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
void (*release_pud)(unsigned long pfn);
+ void (*release_p4d)(unsigned long pfn);
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -279,12 +281,21 @@ struct pv_mmu_ops {
struct paravirt_callee_save pmd_val;
struct paravirt_callee_save make_pmd;
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
struct paravirt_callee_save pud_val;
struct paravirt_callee_save make_pud;
- void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+ void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ struct paravirt_callee_save p4d_val;
+ struct paravirt_callee_save make_p4d;
+
+ void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+
+#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
+
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b6d425999f99..b2d0cd8288aa 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif
/*
@@ -121,10 +123,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
#endif /* CONFIG_X86_PAE */
#if CONFIG_PGTABLE_LEVELS > 3
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+ set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -150,6 +152,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
___pud_free_tlb(tlb, pud);
}
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
+{
+ paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+}
+
+static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ return (p4d_t *)get_zeroed_page(gfp);
+}
+
+static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
+{
+ BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
+ free_page((unsigned long)p4d);
+}
+
+extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
+
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
+ unsigned long address)
+{
+ ___p4d_free_tlb(tlb, p4d);
+}
+
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index 392576433e77..373ab1de909f 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -7,6 +7,7 @@
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
+typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
+#define gup_get_pte gup_get_pte
+/*
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking
+ * any locks. For this we would like to load the pointers atomically,
+ * but that is not possible (without expensive cmpxchg8b) on PAE. What
+ * we do have is the guarantee that a PTE will only either go from not
+ * present to present, or present to not present or both -- it will not
+ * switch to a completely different present page without a TLB flush in
+ * between; something that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
+ * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
+ * don't see an older value of pte_high. *Then* we recheck pte_low,
+ * which ensures that we haven't picked up a changed pte high. We might
+ * have gotten rubbish values from pte_low and pte_high, but we are
+ * guaranteed that pte_low will not have the present bit set *unless*
+ * it is 'l'. Because get_user_pages_fast() only operates on present ptes
+ * we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index bcc89625ebe5..b8a4341faafa 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -7,6 +7,7 @@
typedef u64 pteval_t;
typedef u64 pmdval_t;
typedef u64 pudval_t;
+typedef u64 p4dval_t;
typedef u64 pgdval_t;
typedef u64 pgprotval_t;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 585ee0d42d18..473293a681e0 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -53,11 +53,19 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
-#ifndef __PAGETABLE_PUD_FOLDED
+#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
#define pgd_clear(pgd) native_pgd_clear(pgd)
#endif
+#ifndef set_p4d
+# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d)
+#endif
+
+#ifndef __PAGETABLE_PUD_FOLDED
+#define p4d_clear(p4d) native_p4d_clear(p4d)
+#endif
+
#ifndef set_pud
# define set_pud(pudp, pud) native_set_pud(pudp, pud)
#endif
@@ -74,6 +82,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
+#ifndef __PAGETABLE_P4D_FOLDED
+#define p4d_val(x) native_p4d_val(x)
+#define __p4d(x) native_make_p4d(x)
+#endif
+
#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x) native_pud_val(x)
#define __pud(x) native_make_pud(x)
@@ -179,6 +192,17 @@ static inline unsigned long pud_pfn(pud_t pud)
return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}
+static inline unsigned long p4d_pfn(p4d_t p4d)
+{
+ return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
+}
+
+static inline int p4d_large(p4d_t p4d)
+{
+ /* No 512 GiB pages yet */
+ return 0;
+}
+
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
static inline int pmd_large(pmd_t pte)
@@ -222,6 +246,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif
+
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -538,6 +567,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
+#define p4d_pgprot(x) __pgprot(p4d_flags(x))
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
@@ -587,6 +617,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
+#include <asm/fixmap.h>
static inline int pte_none(pte_t pte)
{
@@ -770,7 +801,52 @@ static inline int pud_large(pud_t pud)
}
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
+static inline unsigned long pud_index(unsigned long address)
+{
+ return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
#if CONFIG_PGTABLE_LEVELS > 3
+static inline int p4d_none(p4d_t p4d)
+{
+ return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+ return p4d_flags(p4d) & _PAGE_PRESENT;
+}
+
+static inline unsigned long p4d_page_vaddr(p4d_t p4d)
+{
+ return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
+}
+
+/*
+ * Currently stuck as a macro due to indirect forward reference to
+ * linux/mmzone.h's __section_mem_map_addr() definition:
+ */
+#define p4d_page(p4d) \
+ pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
+
+/* Find an entry in the third-level page table.. */
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+ return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+ return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+
+static inline unsigned long p4d_index(unsigned long address)
+{
+ return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -788,14 +864,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
/* to find an entry in a page-table-directory. */
-static inline unsigned long pud_index(unsigned long address)
+static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
- return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
-}
-
-static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
-{
- return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
+ return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
static inline int pgd_bad(pgd_t pgd)
@@ -813,7 +884,7 @@ static inline int pgd_none(pgd_t pgd)
*/
return !native_pgd_val(pgd);
}
-#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* __ASSEMBLY__ */
@@ -1120,6 +1191,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * 'pteval' can come from a PTE, PMD or PUD. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
+}
+
+#define pte_access_permitted pte_access_permitted
+static inline bool pte_access_permitted(pte_t pte, bool write)
+{
+ return __pte_access_permitted(pte_val(pte), write);
+}
+
+#define pmd_access_permitted pmd_access_permitted
+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
+{
+ return __pte_access_permitted(pmd_val(pmd), write);
+}
+
+#define pud_access_permitted pud_access_permitted
+static inline bool pud_access_permitted(pud_t pud, bool write)
+{
+ return __pte_access_permitted(pud_val(pud), write);
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index fbc73360aea0..bfab55675c16 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -14,7 +14,6 @@
*/
#ifndef __ASSEMBLY__
#include <asm/processor.h>
-#include <asm/fixmap.h>
#include <linux/threads.h>
#include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 73c7ccc38912..12ea31274eb6 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,15 +35,22 @@ extern void paging_init(void);
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e) \
+ pr_err("%s:%d: bad p4d %p(%016lx)\n", \
+ __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
struct mm_struct;
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
-
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@@ -121,6 +128,20 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
+static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ *p4dp = p4d;
+}
+
+static inline void native_p4d_clear(p4d_t *p4d)
+{
+#ifdef CONFIG_X86_5LEVEL
+ native_set_p4d(p4d, native_make_p4d(0));
+#else
+ native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
+#endif
+}
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
*pgdp = pgd;
@@ -206,6 +227,20 @@ extern void cleanup_highmap(void);
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
-#endif /* !__ASSEMBLY__ */
+#define gup_fast_permitted gup_fast_permitted
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
+ int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long)nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (end < start)
+ return false;
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ return false;
+ return true;
+}
+#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..06470da156ba 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -13,6 +13,7 @@
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
+typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
@@ -22,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;
#define SHARED_KERNEL_PMD 0
+#ifdef CONFIG_X86_5LEVEL
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 48
+#define PTRS_PER_PGD 512
+
+/*
+ * 4th level page in 5-level paging case
+ */
+#define P4D_SHIFT 39
+#define PTRS_PER_P4D 512
+#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK (~(P4D_SIZE - 1))
+
+#else /* CONFIG_X86_5LEVEL */
+
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512
+#endif /* CONFIG_X86_5LEVEL */
+
/*
* 3rd level page
*/
@@ -55,9 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#ifdef CONFIG_X86_5LEVEL
+#define VMALLOC_SIZE_TB _AC(16384, UL)
+#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+#else
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base
@@ -67,10 +94,11 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
-#define MODULES_END _AC(0xffffffffff000000, UL)
+/* The module sections ends with the start of the fixmap */
+#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
+#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 62484333673d..bf9638e1ee42 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -272,9 +272,28 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
-#if CONFIG_PGTABLE_LEVELS > 3
-#include <asm-generic/5level-fixup.h>
+#if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
+
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+ return (p4d_t) { val };
+}
+
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return p4d.p4d;
+}
+#else
+#include <asm-generic/pgtable-nop4d.h>
+
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+ return native_pgd_val(p4d.pgd);
+}
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
@@ -287,12 +306,11 @@ static inline pudval_t native_pud_val(pud_t pud)
return pud.pud;
}
#else
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopud.h>
static inline pudval_t native_pud_val(pud_t pud)
{
- return native_pgd_val(pud.pgd);
+ return native_pgd_val(pud.p4d.pgd);
}
#endif
@@ -309,15 +327,30 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
return pmd.pmd;
}
#else
-#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
- return native_pgd_val(pmd.pud.pgd);
+ return native_pgd_val(pmd.pud.p4d.pgd);
}
#endif
+static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
+{
+ /* No 512 GiB huge pages yet */
+ return PTE_PFN_MASK;
+}
+
+static inline p4dval_t p4d_flags_mask(p4d_t p4d)
+{
+ return ~p4d_pfn_mask(p4d);
+}
+
+static inline p4dval_t p4d_flags(p4d_t p4d)
+{
+ return native_p4d_val(p4d) & p4d_flags_mask(p4d);
+}
+
static inline pudval_t pud_pfn_mask(pud_t pud)
{
if (native_pud_val(pud) & _PAGE_PSE)
@@ -461,6 +494,7 @@ enum pg_level {
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
+ PG_LEVEL_512G,
PG_LEVEL_NUM
};
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..3cada998a402 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -80,7 +80,7 @@ extern u16 __read_mostly tlb_lld_1g[NR_INFO];
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
- * Members of this structure are referenced in head.S, so think twice
+ * Members of this structure are referenced in head_32.S, so think twice
* before touching them. [mj]
*/
@@ -89,14 +89,7 @@ struct cpuinfo_x86 {
__u8 x86_vendor; /* CPU vendor */
__u8 x86_model;
__u8 x86_mask;
-#ifdef CONFIG_X86_32
- char wp_works_ok; /* It doesn't on 386's */
-
- /* Problems on some 486Dx4's and old 386's: */
- char rfu;
- char pad0;
- char pad1;
-#else
+#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
#endif
@@ -716,6 +709,8 @@ extern struct desc_ptr early_gdt_descr;
extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
+extern void load_direct_gdt(int);
+extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
@@ -797,6 +792,7 @@ static inline void spin_lock_prefetch(const void *x)
/*
* User space process size: 3GB (default).
*/
+#define IA32_PAGE_OFFSET PAGE_OFFSET
#define TASK_SIZE PAGE_OFFSET
#define TASK_SIZE_MAX TASK_SIZE
#define STACK_TOP TASK_SIZE
@@ -873,7 +869,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
+#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE)
#define KSTK_EIP(task) (task_pt_regs(task)->ip)
@@ -884,6 +881,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
extern int get_tsc_mode(unsigned long adr);
extern int set_tsc_mode(unsigned int val);
+DECLARE_PER_CPU(u64, msr_misc_features_shadow);
+
/* Register/unregister a process' MPX related resource */
#define MPX_ENABLE_MANAGEMENT() mpx_enable_management()
#define MPX_DISABLE_MANAGEMENT() mpx_disable_management()
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 9b9b30b19441..8d3964fc5f91 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -9,6 +9,7 @@ void syscall_init(void);
#ifdef CONFIG_X86_64
void entry_SYSCALL_64(void);
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
#ifdef CONFIG_X86_32
@@ -30,6 +31,7 @@ void x86_report_nx(void);
extern int reboot_force;
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
+long do_arch_prctl_common(struct task_struct *task, int option,
+ unsigned long cpuid_enabled);
#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 2cb1cc253d51..fc62ba8dce93 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -15,6 +15,7 @@ struct machine_ops {
};
extern struct machine_ops machine_ops;
+extern int crashing_cpu;
void native_machine_crash_shutdown(struct pt_regs *regs);
void native_machine_shutdown(void);
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index fac9a5c0abe9..d91ba04dd007 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,6 +53,12 @@
# define NEED_MOVBE 0
#endif
+#ifdef CONFIG_X86_5LEVEL
+# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#else
+# define NEED_LA57 0
+#endif
+
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@@ -98,7 +104,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 0
+#define REQUIRED_MASK16 (NEED_LA57)
#define REQUIRED_MASK17 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 4517d6b93188..1f5bee2c202f 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -26,8 +26,13 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
-# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 46
+# ifdef CONFIG_X86_5LEVEL
+# define MAX_PHYSADDR_BITS 52
+# define MAX_PHYSMEM_BITS 52
+# else
+# define MAX_PHYSADDR_BITS 44
+# define MAX_PHYSMEM_BITS 46
+# endif
#endif
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 58505f01962f..dcbd9bcce714 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu)
{
#ifdef CONFIG_X86_32
unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
- struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
+ struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
struct desc_struct desc;
desc = gdt_table[GDT_ENTRY_STACK_CANARY];
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 920ca1f7adea..f765a49103fb 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,6 +87,7 @@ struct thread_info {
#define TIF_SECCOMP 8 /* secure computing */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
+#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
@@ -110,6 +111,7 @@ struct thread_info {
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
+#define _TIF_NOCPUID (1 << TIF_NOCPUID)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
#define _TIF_NOHZ (1 << TIF_NOHZ)
@@ -138,7 +140,7 @@ struct thread_info {
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
- (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
+ (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
@@ -240,6 +242,8 @@ static inline int arch_within_stack_frames(const void * const stack,
extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index fc5abff9b7fd..75d002bdb3f3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -110,6 +110,16 @@ static inline void cr4_clear_bits(unsigned long mask)
}
}
+static inline void cr4_toggle_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ cr4 ^= mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+}
+
/* Read the CR4 shadow. */
static inline unsigned long cr4_read_shadow(void)
{
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 57ab86d94d64..7cac79802ad2 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -185,6 +185,15 @@
#define MSG_REGULAR 1
#define MSG_RETRY 2
+#define BAU_DESC_QUALIFIER 0x534749
+
+enum uv_bau_version {
+ UV_BAU_V1 = 1,
+ UV_BAU_V2,
+ UV_BAU_V3,
+ UV_BAU_V4,
+};
+
/*
* Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
* If the 'multilevel' flag in the header portion of the descriptor
@@ -222,20 +231,32 @@ struct bau_local_cpumask {
* the s/w ack bit vector ]
*/
-/*
- * The payload is software-defined for INTD transactions
+/**
+ * struct uv1_2_3_bau_msg_payload - defines payload for INTD transactions
+ * @address: Signifies a page or all TLB's of the cpu
+ * @sending_cpu: CPU from which the message originates
+ * @acknowledge_count: CPUs on the destination Hub that received the interrupt
*/
-struct bau_msg_payload {
- unsigned long address; /* signifies a page or all
- TLB's of the cpu */
- /* 64 bits */
- unsigned short sending_cpu; /* filled in by sender */
- /* 16 bits */
- unsigned short acknowledge_count; /* filled in by destination */
- /* 16 bits */
- unsigned int reserved1:32; /* not usable */
+struct uv1_2_3_bau_msg_payload {
+ u64 address;
+ u16 sending_cpu;
+ u16 acknowledge_count;
};
+/**
+ * struct uv4_bau_msg_payload - defines payload for INTD transactions
+ * @address: Signifies a page or all TLB's of the cpu
+ * @sending_cpu: CPU from which the message originates
+ * @acknowledge_count: CPUs on the destination Hub that received the interrupt
+ * @qualifier: Set by source to verify origin of INTD broadcast
+ */
+struct uv4_bau_msg_payload {
+ u64 address;
+ u16 sending_cpu;
+ u16 acknowledge_count;
+ u32 reserved:8;
+ u32 qualifier:24;
+};
/*
* UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
@@ -385,17 +406,6 @@ struct uv2_3_bau_msg_header {
/* bits 127:120 */
};
-/* Abstracted BAU functions */
-struct bau_operations {
- unsigned long (*read_l_sw_ack)(void);
- unsigned long (*read_g_sw_ack)(int pnode);
- unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
- void (*write_l_sw_ack)(unsigned long mmr);
- void (*write_g_sw_ack)(int pnode, unsigned long mmr);
- void (*write_payload_first)(int pnode, unsigned long mmr);
- void (*write_payload_last)(int pnode, unsigned long mmr);
-};
-
/*
* The activation descriptor:
* The format of the message to send, plus all accompanying control
@@ -411,7 +421,10 @@ struct bau_desc {
struct uv2_3_bau_msg_header uv2_3_hdr;
} header;
- struct bau_msg_payload payload;
+ union bau_payload_header {
+ struct uv1_2_3_bau_msg_payload uv1_2_3;
+ struct uv4_bau_msg_payload uv4;
+ } payload;
};
/* UV1:
* -payload-- ---------header------
@@ -588,8 +601,12 @@ struct uvhub_desc {
struct socket_desc socket[2];
};
-/*
- * one per-cpu; to locate the software tables
+/**
+ * struct bau_control
+ * @status_mmr: location of status mmr, determined by uvhub_cpu
+ * @status_index: index of ERR|BUSY bits in status mmr, determined by uvhub_cpu
+ *
+ * Per-cpu control struct containing CPU topology information and BAU tuneables.
*/
struct bau_control {
struct bau_desc *descriptor_base;
@@ -607,6 +624,8 @@ struct bau_control {
int timeout_tries;
int ipi_attempts;
int conseccompletes;
+ u64 status_mmr;
+ int status_index;
bool nobau;
short baudisabled;
short cpu;
@@ -644,6 +663,19 @@ struct bau_control {
struct hub_and_pnode *thp;
};
+/* Abstracted BAU functions */
+struct bau_operations {
+ unsigned long (*read_l_sw_ack)(void);
+ unsigned long (*read_g_sw_ack)(int pnode);
+ unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
+ void (*write_l_sw_ack)(unsigned long mmr);
+ void (*write_g_sw_ack)(int pnode, unsigned long mmr);
+ void (*write_payload_first)(int pnode, unsigned long mmr);
+ void (*write_payload_last)(int pnode, unsigned long mmr);
+ int (*wait_completion)(struct bau_desc*,
+ struct bau_control*, long try);
+};
+
static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 2444189cbe28..bccdf4938ddf 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -20,6 +20,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_hpet_page;
long sym_pvclock_page;
+ long sym_hvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 33cbd3db97b9..bf2ca56fba11 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -279,13 +279,17 @@ static inline pte_t __pte_ma(pteval_t x)
#define pmd_val_ma(v) ((v).pmd)
#ifdef __PAGETABLE_PUD_FOLDED
-#define pud_val_ma(v) ((v).pgd.pgd)
+#define pud_val_ma(v) ((v).p4d.pgd.pgd)
#else
#define pud_val_ma(v) ((v).pud)
#endif
#define __pmd_ma(x) ((pmd_t) { (x) } )
-#define pgd_val_ma(x) ((x).pgd)
+#ifdef __PAGETABLE_P4D_FOLDED
+#define p4d_val_ma(x) ((x).pgd.pgd)
+#else
+#define p4d_val_ma(x) ((x).p4d)
+#endif
void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 835aa51c7f6e..c45765517092 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -1,10 +1,13 @@
#ifndef _ASM_X86_PRCTL_H
#define _ASM_X86_PRCTL_H
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS 0x1001
+#define ARCH_SET_FS 0x1002
+#define ARCH_GET_FS 0x1003
+#define ARCH_GET_GS 0x1004
+
+#define ARCH_GET_CPUID 0x1011
+#define ARCH_SET_CPUID 0x1012
#define ARCH_MAP_VDSO_X32 0x2001
#define ARCH_MAP_VDSO_32 0x2002
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 84c00592d359..4b994232cb57 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -27,7 +27,7 @@ KASAN_SANITIZE_stacktrace.o := n
OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
-OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
@@ -46,7 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
-obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
@@ -82,6 +82,7 @@ obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_LIVEPATCH) += livepatch.o
+obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b2879cc23db4..70854988a963 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1564,12 +1564,6 @@ int __init early_acpi_boot_init(void)
return 0;
}
-static int __init acpi_parse_bgrt(struct acpi_table_header *table)
-{
- efi_bgrt_init(table);
- return 0;
-}
-
int __init acpi_boot_init(void)
{
/* those are executed after early-quirks are executed */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 48587335ede8..ed014814ea35 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void)
#ifdef CONFIG_SMP
initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ (unsigned long)get_cpu_gdt_rw(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8ccb7ef512e0..c895258bf9ad 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1789,8 +1789,8 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
/*
- * acpi lapic path already maps that address in
- * acpi_register_lapic_address()
+ * If the system has ACPI MADT tables or MP info, the LAPIC
+ * address is already registered.
*/
if (!acpi_lapic && !smp_found_config)
register_lapic_address(apic_phys);
@@ -2237,7 +2237,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
static void __init apic_bsp_up_setup(void)
{
#ifdef CONFIG_X86_64
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+ apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
#else
/*
* Hack: In case of kdump, after a crash, kernel might be booting
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5a414545e8a3..446b0d3d4932 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)
cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
@@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call)
cpu = get_cpu();
BUG_ON(cpu != 0);
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
@@ -2352,7 +2352,7 @@ static int __init apm_init(void)
* Note we only set APM segments on CPU zero, since we pin the APM
* code to that CPU.
*/
- gdt = get_cpu_gdt_table(0);
+ gdt = get_cpu_gdt_rw(0);
set_desc_base(&gdt[APM_CS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
set_desc_base(&gdt[APM_CS_16 >> 3],
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 58094a1f9e9d..8ee32119144d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -448,19 +448,60 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
+/* Setup the fixmap mapping only once per-processor */
+static inline void setup_fixmap_gdt(int cpu)
+{
+#ifdef CONFIG_X86_64
+ /* On 64-bit systems, we use a read-only fixmap GDT. */
+ pgprot_t prot = PAGE_KERNEL_RO;
+#else
+ /*
+ * On native 32-bit systems, the GDT cannot be read-only because
+ * our double fault handler uses a task gate, and entering through
+ * a task gate needs to change an available TSS to busy. If the GDT
+ * is read-only, that will triple fault.
+ *
+ * On Xen PV, the GDT must be read-only because the hypervisor requires
+ * it.
+ */
+ pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+ PAGE_KERNEL_RO : PAGE_KERNEL;
+#endif
+
+ __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+}
+
+/* Load the original GDT from the per-cpu structure */
+void load_direct_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_direct_gdt);
+
+/* Load a fixmap remapping of the per-cpu GDT */
+void load_fixmap_gdt(int cpu)
+{
+ struct desc_ptr gdt_descr;
+
+ gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
+ gdt_descr.size = GDT_SIZE - 1;
+ load_gdt(&gdt_descr);
+}
+EXPORT_SYMBOL_GPL(load_fixmap_gdt);
+
/*
* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one.
*/
void switch_to_new_gdt(int cpu)
{
- struct desc_ptr gdt_descr;
-
- gdt_descr.address = (long)get_cpu_gdt_table(cpu);
- gdt_descr.size = GDT_SIZE - 1;
- load_gdt(&gdt_descr);
+ /* Load the original GDT */
+ load_direct_gdt(cpu);
/* Reload the per-cpu base */
-
load_percpu_segment(cpu);
}
@@ -1526,6 +1567,9 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}
#else
@@ -1581,6 +1625,9 @@ void cpu_init(void)
dbg_restore_debug_regs();
fpu__init_cpu();
+
+ setup_fixmap_gdt(cpu);
+ load_fixmap_gdt(cpu);
}
#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 063197771b8d..dfa90a3a5145 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
return;
}
- if (ring3mwait_disabled) {
- msr_clear_bit(MSR_MISC_FEATURE_ENABLES,
- MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+ if (ring3mwait_disabled)
return;
- }
-
- msr_set_bit(MSR_MISC_FEATURE_ENABLES,
- MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+ this_cpu_or(msr_misc_features_shadow,
+ 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
if (c == &boot_cpu_data)
ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
@@ -488,6 +484,34 @@ static void intel_bsp_resume(struct cpuinfo_x86 *c)
init_intel_energy_perf(c);
}
+static void init_cpuid_fault(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) {
+ if (msr & MSR_PLATFORM_INFO_CPUID_FAULT)
+ set_cpu_cap(c, X86_FEATURE_CPUID_FAULT);
+ }
+}
+
+static void init_intel_misc_features(struct cpuinfo_x86 *c)
+{
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
+ return;
+
+ /* Clear all MISC features */
+ this_cpu_write(msr_misc_features_shadow, 0);
+
+ /* Check features and update capabilities and shadow control bits */
+ init_cpuid_fault(c);
+ probe_xeon_phi_r3mwait(c);
+
+ msr = this_cpu_read(msr_misc_features_shadow);
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
+}
+
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
@@ -602,7 +626,7 @@ static void init_intel(struct cpuinfo_x86 *c)
init_intel_energy_perf(c);
- probe_xeon_phi_r3mwait(c);
+ init_intel_misc_features(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index a3311c886194..43051f0777d4 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -9,3 +9,5 @@ obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += mce-apei.o
+
+obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o
diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
new file mode 100644
index 000000000000..9c632cb88546
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c
@@ -0,0 +1,397 @@
+/*
+ * /dev/mcelog driver
+ *
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/poll.h>
+
+#include "mce-internal.h"
+
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
+
+static char mce_helper[128];
+static char *mce_helper_argv[2] = { mce_helper, NULL };
+
+#define mce_log_get_idx_check(p) \
+({ \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious mce_log_get_idx_check() usage"); \
+ smp_load_acquire(&(p)); \
+})
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+static struct mce_log_buffer mcelog = {
+ .signature = MCE_LOG_SIGNATURE,
+ .len = MCE_LOG_LEN,
+ .recordlen = sizeof(struct mce),
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
+/* User mode helper program triggered by machine check event */
+extern char mce_helper[128];
+
+static int dev_mce_log(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned int next, entry;
+
+ wmb();
+ for (;;) {
+ entry = mce_log_get_idx_check(mcelog.next);
+ for (;;) {
+
+ /*
+ * When the buffer fills up discard new entries.
+ * Assume that the earlier errors are the more
+ * interesting ones:
+ */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW,
+ (unsigned long *)&mcelog.flags);
+ return NOTIFY_OK;
+ }
+ /* Old left over entry. Skip: */
+ if (mcelog.entry[entry].finished) {
+ entry++;
+ continue;
+ }
+ break;
+ }
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ wmb();
+ mcelog.entry[entry].finished = 1;
+ wmb();
+
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block dev_mcelog_nb = {
+ .notifier_call = dev_mce_log,
+ .priority = MCE_PRIO_MCELOG,
+};
+
+static void mce_do_trigger(struct work_struct *work)
+{
+ call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
+}
+
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
+
+void mce_work_trigger(void)
+{
+ if (mce_helper[0])
+ schedule_work(&mce_trigger_work);
+}
+
+static ssize_t
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
+{
+ strcpy(buf, mce_helper);
+ strcat(buf, "\n");
+ return strlen(mce_helper) + 1;
+}
+
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
+ const char *buf, size_t siz)
+{
+ char *p;
+
+ strncpy(mce_helper, buf, sizeof(mce_helper));
+ mce_helper[sizeof(mce_helper)-1] = 0;
+ p = strchr(mce_helper, '\n');
+
+ if (p)
+ *p = 0;
+
+ return strlen(mce_helper) + !!p;
+}
+
+DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+
+/*
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
+ */
+
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
+
+static int mce_chrdev_open(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return -EBUSY;
+ }
+
+ if (file->f_flags & O_EXCL)
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return nonseekable_open(inode, file);
+}
+
+static int mce_chrdev_release(struct inode *inode, struct file *file)
+{
+ spin_lock(&mce_chrdev_state_lock);
+
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
+
+ spin_unlock(&mce_chrdev_state_lock);
+
+ return 0;
+}
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+
+ cpu_tsc[smp_processor_id()] = rdtsc();
+}
+
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ char __user *buf = ubuf;
+ unsigned long *cpu_tsc;
+ unsigned prev, next;
+ int i, err;
+
+ cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
+ if (!cpu_tsc)
+ return -ENOMEM;
+
+ mutex_lock(&mce_chrdev_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
+ next = mce_log_get_idx_check(mcelog.next);
+
+ /* Only supports full reads right now */
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
+
+ err = 0;
+ prev = 0;
+ do {
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2)) {
+ memset(m, 0, sizeof(*m));
+ goto timeout;
+ }
+ cpu_relax();
+ }
+ smp_rmb();
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
+timeout:
+ ;
+ }
+
+ memset(mcelog.entry + prev, 0,
+ (next - prev) * sizeof(struct mce));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+
+ synchronize_sched();
+
+ /*
+ * Collect entries that were still getting written before the
+ * synchronize.
+ */
+ on_each_cpu(collect_tscs, cpu_tsc, 1);
+
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
+ smp_rmb();
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
+ }
+ }
+
+ if (err)
+ err = -EFAULT;
+
+out:
+ mutex_unlock(&mce_chrdev_read_mutex);
+ kfree(cpu_tsc);
+
+ return err ? err : buf - ubuf;
+}
+
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
+{
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (READ_ONCE(mcelog.next))
+ return POLLIN | POLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off))
+{
+ mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ if (mce_write)
+ return mce_write(filp, ubuf, usize, off);
+ else
+ return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice mce_chrdev_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+static __init int dev_mcelog_init_device(void)
+{
+ int err;
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err) {
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ return err;
+ }
+ mce_register_decode_chain(&dev_mcelog_nb);
+ return 0;
+}
+device_initcall_sync(dev_mcelog_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 903043e6a62b..7f2a7c54391f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -96,3 +96,11 @@ static inline bool mce_cmp(struct mce *m1, struct mce *m2)
m1->addr != m2->addr ||
m1->misc != m2->misc;
}
+
+extern struct device_attribute dev_attr_trigger;
+
+#ifdef CONFIG_X86_MCELOG_LEGACY
+extern void mce_work_trigger(void);
+#else
+static inline void mce_work_trigger(void) { }
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5accfbdee3f0..a09bb6775c23 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -35,6 +35,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
+#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -49,20 +50,11 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/reboot.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_chrdev_read_mutex);
-
-static int mce_chrdev_open_count; /* #times opened */
-
-#define mce_log_get_idx_check(p) \
-({ \
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
- !lockdep_is_held(&mce_chrdev_read_mutex), \
- "suspicious mce_log_get_idx_check() usage"); \
- smp_load_acquire(&(p)); \
-})
+static DEFINE_MUTEX(mce_log_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
@@ -87,15 +79,9 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};
-/* User mode helper program triggered by machine check event */
-static unsigned long mce_need_notify;
-static char mce_helper[128];
-static char *mce_helper_argv[2] = { mce_helper, NULL };
-
-static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
-
static DEFINE_PER_CPU(struct mce, mces_seen);
-static int cpu_missing;
+static unsigned long mce_need_notify;
+static int cpu_missing;
/*
* MCA banks polled by the period polling timer for corrected events.
@@ -145,80 +131,36 @@ void mce_setup(struct mce *m)
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-/*
- * Lockless MCE logging infrastructure.
- * This avoids deadlocks on printk locks without having to break locks. Also
- * separate MCEs from kernel messages to avoid bogus bug reports.
- */
-
-static struct mce_log mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
-
-void mce_log(struct mce *mce)
+void mce_log(struct mce *m)
{
- unsigned next, entry;
-
- /* Emit the trace record: */
- trace_mce_record(mce);
-
- if (!mce_gen_pool_add(mce))
+ if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
-
- wmb();
- for (;;) {
- entry = mce_log_get_idx_check(mcelog.next);
- for (;;) {
-
- /*
- * When the buffer fills up discard new entries.
- * Assume that the earlier errors are the more
- * interesting ones:
- */
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW,
- (unsigned long *)&mcelog.flags);
- return;
- }
- /* Old left over entry. Skip: */
- if (mcelog.entry[entry].finished) {
- entry++;
- continue;
- }
- break;
- }
- smp_rmb();
- next = entry + 1;
- if (cmpxchg(&mcelog.next, entry, next) == entry)
- break;
- }
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- wmb();
- mcelog.entry[entry].finished = 1;
- wmb();
-
- set_bit(0, &mce_need_notify);
}
void mce_inject_log(struct mce *m)
{
- mutex_lock(&mce_chrdev_read_mutex);
+ mutex_lock(&mce_log_mutex);
mce_log(m);
- mutex_unlock(&mce_chrdev_read_mutex);
+ mutex_unlock(&mce_log_mutex);
}
EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS 3
static atomic_t num_notifiers;
void mce_register_decode_chain(struct notifier_block *nb)
{
- atomic_inc(&num_notifiers);
+ if (WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC))
+ return;
- WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
+ atomic_inc(&num_notifiers);
atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
@@ -521,7 +463,6 @@ static void mce_schedule_work(void)
static void mce_irq_work_cb(struct irq_work *entry)
{
- mce_notify_irq();
mce_schedule_work();
}
@@ -564,6 +505,78 @@ static int mce_usable_address(struct mce *m)
return 1;
}
+static bool memory_error(struct mce *m)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ /* ErrCodeExt[20:16] */
+ u8 xec = (m->status >> 16) & 0x1f;
+
+ return (xec == 0x0 || xec == 0x8);
+ } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ /*
+ * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+ *
+ * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+ * indicating a memory error. Bit 8 is used for indicating a
+ * cache hierarchy error. The combination of bit 2 and bit 3
+ * is used for indicating a `generic' cache hierarchy error
+ * But we can't just blindly check the above bits, because if
+ * bit 11 is set, then it is a bus/interconnect error - and
+ * either way the above bits just gives more detail on what
+ * bus/interconnect error happened. Note that bit 12 can be
+ * ignored, as it's the "filter" bit.
+ */
+ return (m->status & 0xef80) == BIT(7) ||
+ (m->status & 0xef00) == BIT(8) ||
+ (m->status & 0xeffc) == 0xc;
+ }
+
+ return false;
+}
+
+static bool cec_add_mce(struct mce *m)
+{
+ if (!m)
+ return false;
+
+ /* We eat only correctable DRAM errors with usable addresses. */
+ if (memory_error(m) &&
+ !(m->status & MCI_STATUS_UC) &&
+ mce_usable_address(m))
+ if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+ return true;
+
+ return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ if (cec_add_mce(m))
+ return NOTIFY_STOP;
+
+ /* Emit the trace record: */
+ trace_mce_record(m);
+
+ set_bit(0, &mce_need_notify);
+
+ mce_notify_irq();
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+ .notifier_call = mce_first_notifier,
+ .priority = MCE_PRIO_FIRST,
+};
+
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
@@ -593,15 +606,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;
- /*
- * Run the default notifier if we have only the SRAO
- * notifier and us registered.
- */
- if (atomic_read(&num_notifiers) > 2)
- return NOTIFY_DONE;
-
- /* Don't print when mcelog is running */
- if (mce_chrdev_open_count > 0)
+ if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
return NOTIFY_DONE;
__print_mce(m);
@@ -654,37 +659,6 @@ static void mce_read_aux(struct mce *m, int i)
}
}
-static bool memory_error(struct mce *m)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
- /* ErrCodeExt[20:16] */
- u8 xec = (m->status >> 16) & 0x1f;
-
- return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
- *
- * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
- * indicating a memory error. Bit 8 is used for indicating a
- * cache hierarchy error. The combination of bit 2 and bit 3
- * is used for indicating a `generic' cache hierarchy error
- * But we can't just blindly check the above bits, because if
- * bit 11 is set, then it is a bus/interconnect error - and
- * either way the above bits just gives more detail on what
- * bus/interconnect error happened. Note that bit 12 can be
- * ignored, as it's the "filter" bit.
- */
- return (m->status & 0xef80) == BIT(7) ||
- (m->status & 0xef00) == BIT(8) ||
- (m->status & 0xeffc) == 0xc;
- }
-
- return false;
-}
-
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -1133,9 +1107,22 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* on Intel.
*/
int lmce = 1;
+ int cpu = smp_processor_id();
- /* If this CPU is offline, just bail out. */
- if (cpu_is_offline(smp_processor_id())) {
+ /*
+ * Cases where we avoid rendezvous handler timeout:
+ * 1) If this CPU is offline.
+ *
+ * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
+ * skip those CPUs which remain looping in the 1st kernel - see
+ * crash_nmi_callback().
+ *
+ * Note: there still is a small window between kexec-ing and the new,
+ * kdump kernel establishing a new #MC handler where a broadcasted MCE
+ * might not get handled properly.
+ */
+ if (cpu_is_offline(cpu) ||
+ (crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
@@ -1405,13 +1392,6 @@ static void mce_timer_delete_all(void)
del_timer_sync(&per_cpu(mce_timer, cpu));
}
-static void mce_do_trigger(struct work_struct *work)
-{
- call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
-}
-
-static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
-
/*
* Notify the user(s) about new machine check events.
* Can be called from interrupt context, but not from machine check/NMI
@@ -1423,11 +1403,7 @@ int mce_notify_irq(void)
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
if (test_and_clear_bit(0, &mce_need_notify)) {
- /* wake processes polling /dev/mcelog */
- wake_up_interruptible(&mce_chrdev_wait);
-
- if (mce_helper[0])
- schedule_work(&mce_trigger_work);
+ mce_work_trigger();
if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");
@@ -1694,30 +1670,35 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
return 0;
}
-static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+/*
+ * Init basic CPU features needed for early decoding of MCEs.
+ */
+static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
- break;
-
- case X86_VENDOR_AMD: {
+ if (c->x86_vendor == X86_VENDOR_AMD) {
mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
- /*
- * Install proper ops for Scalable MCA enabled processors
- */
if (mce_flags.smca) {
msr_ops.ctl = smca_ctl_reg;
msr_ops.status = smca_status_reg;
msr_ops.addr = smca_addr_reg;
msr_ops.misc = smca_misc_reg;
}
- mce_amd_feature_init(c);
+ }
+}
+static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ mce_adjust_timer = cmci_intel_adjust_timer;
+ break;
+
+ case X86_VENDOR_AMD: {
+ mce_amd_feature_init(c);
break;
}
@@ -1804,6 +1785,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
machine_check_vector = do_machine_check;
+ __mcheck_cpu_init_early(c);
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
@@ -1829,251 +1811,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
}
-/*
- * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
- */
-
-static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_exclu; /* already open exclusive? */
-
-static int mce_chrdev_open(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- if (mce_chrdev_open_exclu ||
- (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_chrdev_state_lock);
-
- return -EBUSY;
- }
-
- if (file->f_flags & O_EXCL)
- mce_chrdev_open_exclu = 1;
- mce_chrdev_open_count++;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return nonseekable_open(inode, file);
-}
-
-static int mce_chrdev_release(struct inode *inode, struct file *file)
-{
- spin_lock(&mce_chrdev_state_lock);
-
- mce_chrdev_open_count--;
- mce_chrdev_open_exclu = 0;
-
- spin_unlock(&mce_chrdev_state_lock);
-
- return 0;
-}
-
-static void collect_tscs(void *data)
-{
- unsigned long *cpu_tsc = (unsigned long *)data;
-
- cpu_tsc[smp_processor_id()] = rdtsc();
-}
-
-static int mce_apei_read_done;
-
-/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
-static int __mce_read_apei(char __user **ubuf, size_t usize)
-{
- int rc;
- u64 record_id;
- struct mce m;
-
- if (usize < sizeof(struct mce))
- return -EINVAL;
-
- rc = apei_read_mce(&m, &record_id);
- /* Error or no more MCE record */
- if (rc <= 0) {
- mce_apei_read_done = 1;
- /*
- * When ERST is disabled, mce_chrdev_read() should return
- * "no record" instead of "no device."
- */
- if (rc == -ENODEV)
- return 0;
- return rc;
- }
- rc = -EFAULT;
- if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
- return rc;
- /*
- * In fact, we should have cleared the record after that has
- * been flushed to the disk or sent to network in
- * /sbin/mcelog, but we have no interface to support that now,
- * so just clear it to avoid duplication.
- */
- rc = apei_clear_mce(record_id);
- if (rc) {
- mce_apei_read_done = 1;
- return rc;
- }
- *ubuf += sizeof(struct mce);
-
- return 0;
-}
-
-static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
- size_t usize, loff_t *off)
-{
- char __user *buf = ubuf;
- unsigned long *cpu_tsc;
- unsigned prev, next;
- int i, err;
-
- cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
- if (!cpu_tsc)
- return -ENOMEM;
-
- mutex_lock(&mce_chrdev_read_mutex);
-
- if (!mce_apei_read_done) {
- err = __mce_read_apei(&buf, usize);
- if (err || buf != ubuf)
- goto out;
- }
-
- next = mce_log_get_idx_check(mcelog.next);
-
- /* Only supports full reads right now */
- err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
- goto out;
-
- err = 0;
- prev = 0;
- do {
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- struct mce *m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2)) {
- memset(m, 0, sizeof(*m));
- goto timeout;
- }
- cpu_relax();
- }
- smp_rmb();
- err |= copy_to_user(buf, m, sizeof(*m));
- buf += sizeof(*m);
-timeout:
- ;
- }
-
- memset(mcelog.entry + prev, 0,
- (next - prev) * sizeof(struct mce));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
-
- synchronize_sched();
-
- /*
- * Collect entries that were still getting written before the
- * synchronize.
- */
- on_each_cpu(collect_tscs, cpu_tsc, 1);
-
- for (i = next; i < MCE_LOG_LEN; i++) {
- struct mce *m = &mcelog.entry[i];
-
- if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
- err |= copy_to_user(buf, m, sizeof(*m));
- smp_rmb();
- buf += sizeof(*m);
- memset(m, 0, sizeof(*m));
- }
- }
-
- if (err)
- err = -EFAULT;
-
-out:
- mutex_unlock(&mce_chrdev_read_mutex);
- kfree(cpu_tsc);
-
- return err ? err : buf - ubuf;
-}
-
-static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
-{
- poll_wait(file, &mce_chrdev_wait, wait);
- if (READ_ONCE(mcelog.next))
- return POLLIN | POLLRDNORM;
- if (!mce_apei_read_done && apei_check_mce())
- return POLLIN | POLLRDNORM;
- return 0;
-}
-
-static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
- unsigned long arg)
-{
- int __user *p = (int __user *)arg;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case MCE_GET_RECORD_LEN:
- return put_user(sizeof(struct mce), p);
- case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
- default:
- return -ENOTTY;
- }
-}
-
-static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off);
-
-void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
- const char __user *ubuf,
- size_t usize, loff_t *off))
-{
- mce_write = fn;
-}
-EXPORT_SYMBOL_GPL(register_mce_write_callback);
-
-static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
- size_t usize, loff_t *off)
-{
- if (mce_write)
- return mce_write(filp, ubuf, usize, off);
- else
- return -EINVAL;
-}
-
-static const struct file_operations mce_chrdev_ops = {
- .open = mce_chrdev_open,
- .release = mce_chrdev_release,
- .read = mce_chrdev_read,
- .write = mce_chrdev_write,
- .poll = mce_chrdev_poll,
- .unlocked_ioctl = mce_chrdev_ioctl,
- .llseek = no_llseek,
-};
-
-static struct miscdevice mce_chrdev_device = {
- MISC_MCELOG_MINOR,
- "mcelog",
- &mce_chrdev_ops,
-};
-
static void __mce_disable_bank(void *arg)
{
int bank = *((int *)arg);
@@ -2147,6 +1884,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mce_register_decode_chain(&first_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
@@ -2291,29 +2029,6 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return size;
}
-static ssize_t
-show_trigger(struct device *s, struct device_attribute *attr, char *buf)
-{
- strcpy(buf, mce_helper);
- strcat(buf, "\n");
- return strlen(mce_helper) + 1;
-}
-
-static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
- const char *buf, size_t siz)
-{
- char *p;
-
- strncpy(mce_helper, buf, sizeof(mce_helper));
- mce_helper[sizeof(mce_helper)-1] = 0;
- p = strchr(mce_helper, '\n');
-
- if (p)
- *p = 0;
-
- return strlen(mce_helper) + !!p;
-}
-
static ssize_t set_ignore_ce(struct device *s,
struct device_attribute *attr,
const char *buf, size_t size)
@@ -2370,7 +2085,6 @@ static ssize_t store_int_with_restart(struct device *s,
return ret;
}
-static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
@@ -2393,7 +2107,9 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
static struct device_attribute *mce_device_attrs[] = {
&dev_attr_tolerant.attr,
&dev_attr_check_interval.attr,
+#ifdef CONFIG_X86_MCELOG_LEGACY
&dev_attr_trigger,
+#endif
&dev_attr_monarch_timeout.attr,
&dev_attr_dont_log_ce.attr,
&dev_attr_ignore_ce.attr,
@@ -2567,7 +2283,6 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void)
{
- enum cpuhp_state hp_online;
int err;
if (!mce_available(&boot_cpu_data)) {
@@ -2595,21 +2310,11 @@ static __init int mcheck_init_device(void)
mce_cpu_online, mce_cpu_pre_down);
if (err < 0)
goto err_out_online;
- hp_online = err;
register_syscore_ops(&mce_syscore_ops);
- /* register character device /dev/mcelog */
- err = misc_register(&mce_chrdev_device);
- if (err)
- goto err_register;
-
return 0;
-err_register:
- unregister_syscore_ops(&mce_syscore_ops);
- cpuhp_remove_state(hp_online);
-
err_out_online:
cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
@@ -2617,7 +2322,7 @@ err_out_mem:
free_cpumask_var(mce_device_initialized);
err_out:
- pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ pr_err("Unable to init MCE device (rc: %d)\n", err);
return err;
}
@@ -2696,6 +2401,7 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init();
+ cec_init();
/*
* Flush out everything that has been logged during early boot, now that
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 7889ae492af0..1d38e53c2d5c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -352,8 +352,6 @@ void reload_ucode_amd(void)
u32 rev, dummy;
mc = (struct microcode_amd *)amd_ucode_patch;
- if (!mc)
- return;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 18ca99f2798b..6df621ae62a7 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -31,14 +31,13 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"fpu\t\t: %s\n"
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
- "wp\t\t: %s\n",
+ "wp\t\t: yes\n",
static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- c->cpuid_level,
- c->wp_works_ok ? "yes" : "no");
+ c->cpuid_level);
}
#else
static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 09d4ac0d2661..924f45ea4382 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -289,9 +289,6 @@ void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
int sig = SIGSEGV;
- if (!user_mode(regs))
- report_bug(regs->ip, regs);
-
if (__die(str, regs, err))
sig = 0;
oops_end(flags, regs, sig);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b0b3a3df7c20..e5f0b40e66d2 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -162,15 +162,3 @@ void show_regs(struct pt_regs *regs)
}
pr_cont("\n");
}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
-
- if (ip < PAGE_OFFSET)
- return 0;
- if (probe_kernel_address((unsigned short *)ip, ud2))
- return 0;
-
- return ud2 == 0x0b0f;
-}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a8b117e93b46..3e1471d57487 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -178,13 +178,3 @@ void show_regs(struct pt_regs *regs)
}
pr_cont("\n");
}
-
-int is_valid_bugaddr(unsigned long ip)
-{
- unsigned short ud2;
-
- if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
- return 0;
-
- return ud2 == 0x0b0f;
-}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 8a121991e5ba..0f0840304452 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -17,6 +17,7 @@
#include <asm/intel-mid.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
+#include <linux/usb/xhci-dbgp.h>
#include <linux/efi.h>
#include <asm/efi.h>
#include <asm/pci_x86.h>
@@ -381,6 +382,10 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "efi", 3))
early_console_register(&early_efi_console, keep);
#endif
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+ if (!strncmp(buf, "xdbc", 4))
+ early_xdbc_parse_parameter(buf + 4);
+#endif
buf++;
}
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..8e598a1ad986 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -50,11 +50,11 @@
#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
/* There is address space for how many espfix pages? */
-#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
-# error "Need more than one PGD for the ESPFIX hack"
+# error "Need more virtual address space for the ESPFIX hack"
#endif
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
@@ -121,11 +121,13 @@ static void init_espfix_random(void)
void __init init_espfix_bsp(void)
{
- pgd_t *pgd_p;
+ pgd_t *pgd;
+ p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */
- pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
- pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+ pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+ p4d_populate(&init_mm, p4d, espfix_pud_page);
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
new file mode 100644
index 000000000000..07f40359c9ea
--- /dev/null
+++ b/arch/x86/kernel/ftrace_32.S
@@ -0,0 +1,244 @@
+/*
+ * Copyright (C) 2017 Steven Rostedt, VMware Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+#include <asm/segment.h>
+#include <asm/export.h>
+#include <asm/ftrace.h>
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
+#else
+# define function_hook mcount
+EXPORT_SYMBOL(mcount)
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */
+#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER)
+# define USING_FRAME_POINTER
+#endif
+
+#ifdef USING_FRAME_POINTER
+# define MCOUNT_FRAME 1 /* using frame = true */
+#else
+# define MCOUNT_FRAME 0 /* using frame = false */
+#endif
+
+ENTRY(function_hook)
+ ret
+END(function_hook)
+
+ENTRY(ftrace_caller)
+
+#ifdef USING_FRAME_POINTER
+# ifdef CC_USING_FENTRY
+ /*
+ * Frame pointers are of ip followed by bp.
+ * Since fentry is an immediate jump, we are left with
+ * parent-ip, function-ip. We need to add a frame with
+ * parent-ip followed by ebp.
+ */
+ pushl 4(%esp) /* parent ip */
+ pushl %ebp
+ movl %esp, %ebp
+ pushl 2*4(%esp) /* function ip */
+# endif
+ /* For mcount, the function ip is directly above */
+ pushl %ebp
+ movl %esp, %ebp
+#endif
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+
+#ifdef USING_FRAME_POINTER
+ /* Load parent ebp into edx */
+ movl 4*4(%esp), %edx
+#else
+ /* There's no frame pointer, load the appropriate stack addr instead */
+ lea 4*4(%esp), %edx
+#endif
+
+ movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */
+ /* Get the parent ip */
+ movl 4(%edx), %edx /* edx has ebp */
+
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+#ifdef USING_FRAME_POINTER
+ popl %ebp
+# ifdef CC_USING_FENTRY
+ addl $4,%esp /* skip function ip */
+ popl %ebp /* this is the orig bp */
+ addl $4, %esp /* skip parent ip */
+# endif
+#endif
+.Lftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+/* This is weak to keep gas from relaxing the jumps */
+WEAK(ftrace_stub)
+ ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * regs->ip location, and move flags into the return ip location.
+ */
+ pushl $__KERNEL_CS
+ pushl 4(%esp) /* Save the return ip */
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+
+ /* Get flags and place them into the return ip slot */
+ pushf
+ popl %eax
+ movl %eax, 8*4(%esp)
+
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+#ifdef CC_USING_FENTRY
+ /* Load 4 off of the parent ip addr into ebp */
+ lea 14*4(%esp), %ebp
+#endif
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+
+ /* restore flags */
+ push 14*4(%esp)
+ popf
+
+ /* Move return ip back to its original location */
+ movl 12*4(%esp), %eax
+ movl %eax, 14*4(%esp)
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+
+ /* use lea to not affect flags */
+ lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */
+
+ jmp .Lftrace_ret
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz .Ltrace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+.Ltrace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 3*4(%esp), %eax
+ /* Even with frame pointers, fentry doesn't have one here */
+#ifdef CC_USING_FENTRY
+ lea 4*4(%esp), %edx
+ movl $0, %ecx
+#else
+ lea 0x4(%ebp), %edx
+ movl (%ebp), %ecx
+#endif
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl %eax
+ pushl %edx
+#ifdef CC_USING_FENTRY
+ movl $0, %eax
+#else
+ movl %ebp, %eax
+#endif
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ popl %edx
+ popl %eax
+ jmp *%ecx
+#endif
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/ftrace_64.S
index 7b0d3da52fb4..1dfac634bbf7 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -1,6 +1,4 @@
/*
- * linux/arch/x86_64/mcount_64.S
- *
* Copyright (C) 2014 Steven Rostedt, Red Hat Inc
*/
@@ -13,9 +11,6 @@
.code64
.section .entry.text, "ax"
-
-#ifdef CONFIG_FUNCTION_TRACER
-
#ifdef CC_USING_FENTRY
# define function_hook __fentry__
EXPORT_SYMBOL(__fentry__)
@@ -297,7 +292,6 @@ trace:
jmp fgraph_trace
END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b467b14b03eb..ac9d327d2e42 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -269,10 +269,8 @@ ENTRY(secondary_startup_64)
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
- jmp start_cpu
-ENDPROC(secondary_startup_64)
-ENTRY(start_cpu)
+.Ljump_to_C_code:
/*
* Jump to run C code and to be on a real kernel address.
* Since we are running on identity-mapped space we have to jump
@@ -305,7 +303,7 @@ ENTRY(start_cpu)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-ENDPROC(start_cpu)
+ENDPROC(secondary_startup_64)
#include "verify_cpu.S"
@@ -313,11 +311,11 @@ ENDPROC(start_cpu)
/*
* Boot CPU0 entry point. It's called from play_dead(). Everything has been set
* up already except stack. We just set up stack here. Then call
- * start_secondary() via start_cpu().
+ * start_secondary() via .Ljump_to_C_code.
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
- jmp start_cpu
+ jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 469b23d6acc2..5f43cec296c5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one(
pgd_t *pgd, pmd_t *pmd, pte_t *pte,
unsigned long vaddr, unsigned long paddr)
{
+ p4d_t *p4d;
pud_t *pud;
pgd += pgd_index(vaddr);
@@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one(
if (!(pgd_val(*pgd) & _PAGE_PRESENT))
set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
#endif
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
if (!(pmd_val(*pmd) & _PAGE_PRESENT))
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 857cdbd02867..085c3b300d32 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = {
static void free_transition_pgtable(struct kimage *image)
{
+ free_page((unsigned long)image->arch.p4d);
free_page((unsigned long)image->arch.pud);
free_page((unsigned long)image->arch.pmd);
free_page((unsigned long)image->arch.pte);
@@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image)
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+ if (!p4d)
+ goto err;
+ image->arch.p4d = p4d;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
goto err;
image->arch.pud = pud;
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
- pud = pud_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 4797e87b0fb6..3586996fc50d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pud = paravirt_nop,
+ .alloc_p4d = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
.release_pud = paravirt_nop,
+ .release_p4d = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
@@ -430,12 +432,19 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
.pud_val = PTE_IDENT,
.make_pud = PTE_IDENT,
+ .set_p4d = native_set_p4d,
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+ .p4d_val = PTE_IDENT,
+ .make_p4d = PTE_IDENT,
+
.set_pgd = native_set_pgd,
-#endif
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
.pte_val = PTE_IDENT,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 0c150c06fa5a..fda7867046d0 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1007,9 +1007,8 @@ static void __init calgary_enable_translation(struct pci_dev *dev)
writel(cpu_to_be32(val32), target);
readl(target); /* flush */
- init_timer(&tbl->watchdog_timer);
- tbl->watchdog_timer.function = &calgary_watchdog;
- tbl->watchdog_timer.data = (unsigned long)dev;
+ setup_timer(&tbl->watchdog_timer, &calgary_watchdog,
+ (unsigned long)dev);
mod_timer(&tbl->watchdog_timer, jiffies);
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f67591561711..0bb88428cbf2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,6 +37,7 @@
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>
+#include <asm/prctl.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -124,11 +125,6 @@ void flush_thread(void)
fpu__clear(&tsk->thread.fpu);
}
-static void hard_disable_TSC(void)
-{
- cr4_set_bits(X86_CR4_TSD);
-}
-
void disable_TSC(void)
{
preempt_disable();
@@ -137,15 +133,10 @@ void disable_TSC(void)
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
- hard_disable_TSC();
+ cr4_set_bits(X86_CR4_TSD);
preempt_enable();
}
-static void hard_enable_TSC(void)
-{
- cr4_clear_bits(X86_CR4_TSD);
-}
-
static void enable_TSC(void)
{
preempt_disable();
@@ -154,7 +145,7 @@ static void enable_TSC(void)
* Must flip the CPU state synchronously with
* TIF_NOTSC in the current running context.
*/
- hard_enable_TSC();
+ cr4_clear_bits(X86_CR4_TSD);
preempt_enable();
}
@@ -182,54 +173,129 @@ int set_tsc_mode(unsigned int val)
return 0;
}
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
- struct tss_struct *tss)
-{
- struct thread_struct *prev, *next;
-
- prev = &prev_p->thread;
- next = &next_p->thread;
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
- if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
- test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
+static void set_cpuid_faulting(bool on)
+{
+ u64 msrval;
- debugctl &= ~DEBUGCTLMSR_BTF;
- if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
- debugctl |= DEBUGCTLMSR_BTF;
+ msrval = this_cpu_read(msr_misc_features_shadow);
+ msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+ msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+ this_cpu_write(msr_misc_features_shadow, msrval);
+ wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+}
- update_debugctlmsr(debugctl);
+static void disable_cpuid(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(true);
}
+ preempt_enable();
+}
- if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
- test_tsk_thread_flag(next_p, TIF_NOTSC)) {
- /* prev and next are different */
- if (test_tsk_thread_flag(next_p, TIF_NOTSC))
- hard_disable_TSC();
- else
- hard_enable_TSC();
+static void enable_cpuid(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOCPUID in the current running context.
+ */
+ set_cpuid_faulting(false);
}
+ preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+ return !test_thread_flag(TIF_NOCPUID);
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+{
+ if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+ return -ENODEV;
+
+ if (cpuid_enabled)
+ enable_cpuid();
+ else
+ disable_cpuid();
+
+ return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+ /* If cpuid was previously disabled for this task, re-enable it. */
+ if (test_thread_flag(TIF_NOCPUID))
+ enable_cpuid();
+}
- if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+static inline void switch_to_bitmap(struct tss_struct *tss,
+ struct thread_struct *prev,
+ struct thread_struct *next,
+ unsigned long tifp, unsigned long tifn)
+{
+ if (tifn & _TIF_IO_BITMAP) {
/*
* Copy the relevant range of the IO bitmap.
* Normally this is 128 bytes or less:
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
-
/*
* Make sure that the TSS limit is correct for the CPU
* to notice the IO bitmap.
*/
refresh_tss_limit();
- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+ } else if (tifp & _TIF_IO_BITMAP) {
/*
* Clear any possible leftover bits:
*/
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
+}
+
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
+ struct tss_struct *tss)
+{
+ struct thread_struct *prev, *next;
+ unsigned long tifp, tifn;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ tifn = READ_ONCE(task_thread_info(next_p)->flags);
+ tifp = READ_ONCE(task_thread_info(prev_p)->flags);
+ switch_to_bitmap(tss, prev, next, tifp, tifn);
+
propagate_user_return_notify(prev_p, next_p);
+
+ if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
+ arch_has_block_step()) {
+ unsigned long debugctl, msk;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ msk = tifn & _TIF_BLOCKSTEP;
+ debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ }
+
+ if ((tifp ^ tifn) & _TIF_NOTSC)
+ cr4_toggle_bits(X86_CR4_TSD);
+
+ if ((tifp ^ tifn) & _TIF_NOCPUID)
+ set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
}
/*
@@ -550,3 +616,16 @@ out:
put_task_stack(p);
return ret;
}
+
+long do_arch_prctl_common(struct task_struct *task, int option,
+ unsigned long cpuid_enabled)
+{
+ switch (option) {
+ case ARCH_GET_CPUID:
+ return get_cpuid_mode();
+ case ARCH_SET_CPUID:
+ return set_cpuid_mode(task, cpuid_enabled);
+ }
+
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4c818f8bc135..ff40e74c9181 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kdebug.h>
+#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/ldt.h>
@@ -56,6 +57,7 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
#include <asm/intel_rdt.h>
+#include <asm/proto.h>
void __show_regs(struct pt_regs *regs, int all)
{
@@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
return prev_p;
}
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, option, arg2);
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d6b784a5520d..825a1e47cf3e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
+#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -52,6 +53,11 @@
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/intel_rdt.h>
+#include <asm/unistd.h>
+#ifdef CONFIG_IA32_EMULATION
+/* Not included via unistd.h */
+#include <asm/unistd_32_ia32.h>
+#endif
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -204,7 +210,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
(struct user_desc __user *)tls, 0);
else
#endif
- err = do_arch_prctl(p, ARCH_SET_FS, tls);
+ err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
if (err)
goto out;
}
@@ -493,6 +499,8 @@ void set_personality_64bit(void)
clear_thread_flag(TIF_IA32);
clear_thread_flag(TIF_ADDR32);
clear_thread_flag(TIF_X32);
+ /* Pretend that this comes from a 64bit execve */
+ task_pt_regs(current)->orig_ax = __NR_execve;
/* Ensure the corresponding mm is not marked. */
if (current->mm)
@@ -505,32 +513,50 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
-void set_personality_ia32(bool x32)
+static void __set_personality_x32(void)
{
- /* inherit personality from parent */
+#ifdef CONFIG_X86_X32
+ clear_thread_flag(TIF_IA32);
+ set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
+ current->personality &= ~READ_IMPLIES_EXEC;
+ /*
+ * in_compat_syscall() uses the presence of the x32 syscall bit
+ * flag to determine compat status. The x86 mmap() code relies on
+ * the syscall bitness so set x32 syscall bit right here to make
+ * in_compat_syscall() work during exec().
+ *
+ * Pretend to come from a x32 execve.
+ */
+ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
+ current->thread.status &= ~TS_COMPAT;
+#endif
+}
+
+static void __set_personality_ia32(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ set_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
+ current->personality |= force_personality32;
+ /* Prepare the first "return" to user space */
+ task_pt_regs(current)->orig_ax = __NR_ia32_execve;
+ current->thread.status |= TS_COMPAT;
+#endif
+}
+void set_personality_ia32(bool x32)
+{
/* Make sure to be in 32bit mode */
set_thread_flag(TIF_ADDR32);
- /* Mark the associated mm as containing 32-bit tasks. */
- if (x32) {
- clear_thread_flag(TIF_IA32);
- set_thread_flag(TIF_X32);
- if (current->mm)
- current->mm->context.ia32_compat = TIF_X32;
- current->personality &= ~READ_IMPLIES_EXEC;
- /* in_compat_syscall() uses the presence of the x32
- syscall bit flag to determine compat status */
- current->thread.status &= ~TS_COMPAT;
- } else {
- set_thread_flag(TIF_IA32);
- clear_thread_flag(TIF_X32);
- if (current->mm)
- current->mm->context.ia32_compat = TIF_IA32;
- current->personality |= force_personality32;
- /* Prepare the first "return" to user space */
- current->thread.status |= TS_COMPAT;
- }
+ if (x32)
+ __set_personality_x32();
+ else
+ __set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
@@ -547,70 +573,72 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
}
#endif
-long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
+long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
int doit = task == current;
int cpu;
- switch (code) {
+ switch (option) {
case ARCH_SET_GS:
- if (addr >= TASK_SIZE_MAX)
+ if (arg2 >= TASK_SIZE_MAX)
return -EPERM;
cpu = get_cpu();
task->thread.gsindex = 0;
- task->thread.gsbase = addr;
+ task->thread.gsbase = arg2;
if (doit) {
load_gs_index(0);
- ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
+ ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
}
put_cpu();
break;
case ARCH_SET_FS:
/* Not strictly needed for fs, but do it for symmetry
with gs */
- if (addr >= TASK_SIZE_MAX)
+ if (arg2 >= TASK_SIZE_MAX)
return -EPERM;
cpu = get_cpu();
task->thread.fsindex = 0;
- task->thread.fsbase = addr;
+ task->thread.fsbase = arg2;
if (doit) {
/* set the selector to 0 to not confuse __switch_to */
loadsegment(fs, 0);
- ret = wrmsrl_safe(MSR_FS_BASE, addr);
+ ret = wrmsrl_safe(MSR_FS_BASE, arg2);
}
put_cpu();
break;
case ARCH_GET_FS: {
unsigned long base;
+
if (doit)
rdmsrl(MSR_FS_BASE, base);
else
base = task->thread.fsbase;
- ret = put_user(base, (unsigned long __user *)addr);
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
unsigned long base;
+
if (doit)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gsbase;
- ret = put_user(base, (unsigned long __user *)addr);
+ ret = put_user(base, (unsigned long __user *)arg2);
break;
}
#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
case ARCH_MAP_VDSO_X32:
- return prctl_map_vdso(&vdso_image_x32, addr);
+ return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
case ARCH_MAP_VDSO_32:
- return prctl_map_vdso(&vdso_image_32, addr);
+ return prctl_map_vdso(&vdso_image_32, arg2);
# endif
case ARCH_MAP_VDSO_64:
- return prctl_map_vdso(&vdso_image_64, addr);
+ return prctl_map_vdso(&vdso_image_64, arg2);
#endif
default:
@@ -621,10 +649,23 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
return ret;
}
-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
- return do_arch_prctl(current, code, addr);
+ long ret;
+
+ ret = do_arch_prctl_64(current, option, arg2);
+ if (ret == -EINVAL)
+ ret = do_arch_prctl_common(current, option, arg2);
+
+ return ret;
+}
+
+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return do_arch_prctl_common(current, option, arg2);
}
+#endif
unsigned long KSTK_ESP(struct task_struct *task)
{
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2364b23ea3e5..f37d18124648 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -396,12 +396,12 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
/*
- * When changing the segment base, use do_arch_prctl
+ * When changing the segment base, use do_arch_prctl_64
* to set either thread.fs or thread.fsindex and the
* corresponding GDT slot.
*/
if (child->thread.fsbase != value)
- return do_arch_prctl(child, ARCH_SET_FS, value);
+ return do_arch_prctl_64(child, ARCH_SET_FS, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
/*
@@ -410,7 +410,7 @@ static int putreg(struct task_struct *child,
if (value >= TASK_SIZE_MAX)
return -EIO;
if (child->thread.gsbase != value)
- return do_arch_prctl(child, ARCH_SET_GS, value);
+ return do_arch_prctl_64(child, ARCH_SET_GS, value);
return 0;
#endif
}
@@ -869,7 +869,7 @@ long arch_ptrace(struct task_struct *child, long request,
Works just like arch_prctl, except that the arguments
are reversed. */
case PTRACE_ARCH_PRCTL:
- ret = do_arch_prctl(child, data, addr);
+ ret = do_arch_prctl_64(child, data, addr);
break;
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 067f9813fd2c..2544700a2a87 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -765,10 +765,11 @@ void machine_crash_shutdown(struct pt_regs *regs)
#endif
+/* This is the CPU performing the emergency shutdown work. */
+int crashing_cpu = -1;
+
#if defined(CONFIG_SMP)
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
static nmi_shootdown_cb shootdown_callback;
static atomic_t waiting_for_crash_ipi;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4bf0c8926a1c..ddc8f1ce45aa 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -70,6 +70,7 @@
#include <linux/tboot.h>
#include <linux/jiffies.h>
+#include <linux/usb/xhci-dbgp.h>
#include <video/edid.h>
#include <asm/mtrr.h>
@@ -173,14 +174,11 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data = {
- .wp_works_ok = -1,
-};
+/* cpu data as detected by the assembly code in head_32.S */
+struct cpuinfo_x86 new_cpu_data;
+
/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data __read_mostly = {
- .wp_works_ok = -1,
-};
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
unsigned int def_to_bigsmp;
@@ -837,6 +835,26 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
return 0;
}
+static void __init simple_udelay_calibration(void)
+{
+ unsigned int tsc_khz, cpu_khz;
+ unsigned long lpj;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC))
+ return;
+
+ cpu_khz = x86_platform.calibrate_cpu();
+ tsc_khz = x86_platform.calibrate_tsc();
+
+ tsc_khz = tsc_khz ? : cpu_khz;
+ if (!tsc_khz)
+ return;
+
+ lpj = tsc_khz * 1000;
+ do_div(lpj, HZ);
+ loops_per_jiffy = lpj;
+}
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -985,6 +1003,8 @@ void __init setup_arch(char **cmdline_p)
*/
x86_configure_nx();
+ simple_udelay_calibration();
+
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1122,6 +1142,9 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
+ if (!early_xdbc_setup_hardware())
+ early_xdbc_register_console();
+
reserve_bios_regions();
if (efi_enabled(EFI_MEMMAP)) {
@@ -1226,21 +1249,6 @@ void __init setup_arch(char **cmdline_p)
kasan_init();
-#ifdef CONFIG_X86_32
- /* sync back kernel address range */
- clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
- swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- KERNEL_PGD_PTRS);
-
- /*
- * sync back low identity map too. It is used for example
- * in the 32-bit EFI stub.
- */
- clone_pgd_range(initial_page_table,
- swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
-#endif
-
tboot_probe();
map_vsyscall();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 9820d6d977c6..bb1e8cc0bc84 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu)
pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
0x2 | DESCTYPE_S, 0x8);
gdt.s = 1;
- write_gdt_entry(get_cpu_gdt_table(cpu),
+ write_gdt_entry(get_cpu_gdt_rw(cpu),
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}
@@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void)
/* Setup cpu initialized, callin, callout masks */
setup_cpu_local_masks();
+
+#ifdef CONFIG_X86_32
+ /*
+ * Sync back kernel address range. We want to make sure that
+ * all kernel mappings, including percpu mappings, are available
+ * in the smpboot asm. We can't reliably pick up percpu
+ * mappings using vmalloc_fault(), because exception dispatch
+ * needs percpu data.
+ */
+ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+
+ /*
+ * sync back low identity map too. It is used for example
+ * in the 32-bit EFI stub.
+ */
+ clone_pgd_range(initial_page_table,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+#endif
}
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index ec1f756f9dc9..71beb28600d4 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
if (from->si_signo == SIGSEGV) {
if (from->si_code == SEGV_BNDERR) {
- compat_uptr_t lower = (unsigned long)&to->si_lower;
- compat_uptr_t upper = (unsigned long)&to->si_upper;
+ compat_uptr_t lower = (unsigned long)from->si_lower;
+ compat_uptr_t upper = (unsigned long)from->si_upper;
put_user_ex(lower, &to->si_lower);
put_user_ex(upper, &to->si_upper);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bd1f1ad35284..f04479a8f74f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long timeout;
idle->thread.sp = (unsigned long)task_pt_regs(idle);
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 50215a4b9347..207b8f2582c7 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -17,6 +17,8 @@
#include <linux/uaccess.h>
#include <linux/elf.h>
+#include <asm/elf.h>
+#include <asm/compat.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
@@ -101,7 +103,7 @@ out:
static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
- if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
+ if (!in_compat_syscall() && (flags & MAP_32BIT)) {
/* This is usually used needed to map code in small
model, so it needs to be in the first 31bit. Limit
it to that. This means we need to move the
@@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
if (current->flags & PF_RANDOMIZE) {
*begin = randomize_page(*begin, 0x02000000);
}
- } else {
- *begin = current->mm->mmap_legacy_base;
- *end = TASK_SIZE;
+ return;
}
+
+ *begin = get_mmap_base(1);
+ *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
}
unsigned long
@@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
/* for MAP_32BIT mappings we force the legacy mmap base */
- if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
+ if (!in_compat_syscall() && (flags & MAP_32BIT))
goto bottomup;
/* requesting a specific address */
@@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
- info.high_limit = mm->mmap_base;
+ info.high_limit = get_mmap_base(0);
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index b868fa1b812b..5db0f33cbf2c 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pgprot_t prot)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = pgd_offset(&tboot_mm, vaddr);
- pud = pud_alloc(&tboot_mm, pgd, vaddr);
+ p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
+ if (!p4d)
+ return -1;
+ pud = pud_alloc(&tboot_mm, p4d, vaddr);
if (!pud)
return -1;
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 6c8934406dc9..dcd699baea1b 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info) || LDT_zero(info))
+ if (LDT_empty(info) || LDT_zero(info)) {
desc->a = desc->b = 0;
- else
+ } else {
fill_ldt(desc, info);
+
+ /*
+ * Always set the accessed bit so that the CPU
+ * doesn't try to write to the (read-only) GDT.
+ */
+ desc->type |= 1;
+ }
++info;
++desc;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 948443e115c1..3c0751b120de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -169,6 +169,37 @@ void ist_end_non_atomic(void)
preempt_disable();
}
+int is_valid_bugaddr(unsigned long addr)
+{
+ unsigned short ud;
+
+ if (addr < TASK_SIZE_MAX)
+ return 0;
+
+ if (probe_kernel_address((unsigned short *)addr, ud))
+ return 0;
+
+ return ud == INSN_UD0 || ud == INSN_UD2;
+}
+
+static int fixup_bug(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr != X86_TRAP_UD)
+ return 0;
+
+ switch (report_bug(regs->ip, regs)) {
+ case BUG_TRAP_TYPE_NONE:
+ case BUG_TRAP_TYPE_BUG:
+ break;
+
+ case BUG_TRAP_TYPE_WARN:
+ regs->ip += LEN_UD0;
+ return 1;
+ }
+
+ return 0;
+}
+
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
@@ -187,12 +218,15 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
}
if (!user_mode(regs)) {
- if (!fixup_exception(regs, trapnr)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = trapnr;
- die(str, regs, error_code);
- }
- return 0;
+ if (fixup_exception(regs, trapnr))
+ return 0;
+
+ if (fixup_bug(regs, trapnr))
+ return 0;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
}
return -1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 23ee89ce59a9..62597c300d94 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
struct vm_area_struct *vma;
spinlock_t *ptl;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
- pud = pud_offset(pgd, 0xA0000);
+ p4d = p4d_offset(pgd, 0xA0000);
+ if (p4d_none_or_clear_bad(p4d))
+ goto out;
+ pud = pud_offset(p4d, 0xA0000);
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c74ae9ce8dc4..c8a3b61be0aa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -146,6 +146,7 @@ SECTIONS
_edata = .;
} :data
+ BUG_TABLE
. = ALIGN(PAGE_SIZE);
__vvar_page = .;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5fba70646c32..5f48f62b8dc2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -741,7 +741,6 @@ static int svm_hardware_enable(void)
struct svm_cpu_data *sd;
uint64_t efer;
- struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -763,8 +762,7 @@ static int svm_hardware_enable(void)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
- native_store_gdt(&gdt_descr);
- gdt = (struct desc_struct *)gdt_descr.address;
+ gdt = get_current_gdt_rw();
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2ee00dbbbd51..535cc065b844 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
/*
* We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
@@ -2057,14 +2056,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
static unsigned long segment_base(u16 selector)
{
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *table;
unsigned long v;
if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
- table = (struct desc_struct *)gdt->address;
+ table = get_current_gdt_ro();
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
@@ -2169,7 +2167,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
if (vmx->host_state.msr_host_bndcfgs)
wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
- load_gdt(this_cpu_ptr(&host_gdt));
+ load_fixmap_gdt(raw_smp_processor_id());
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2271,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (!already_loaded) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2282,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
- vmcs_writel(HOST_GDTR_BASE, gdt->address);
+ vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
* VM exits change the host TR limit to 0x67 after a VM
@@ -3471,8 +3469,6 @@ static int hardware_enable(void)
ept_sync_global();
}
- native_store_gdt(this_cpu_ptr(&host_gdt));
-
return 0;
}
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 5e2af3a88cf5..81b1635d67de 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -14,20 +14,15 @@
* Zero a page.
* %rdi - page
*/
-ENTRY(clear_page)
-
- ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp clear_page_c_e", X86_FEATURE_ERMS
-
+ENTRY(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
-ENDPROC(clear_page)
-EXPORT_SYMBOL(clear_page)
+ENDPROC(clear_page_rep)
+EXPORT_SYMBOL_GPL(clear_page_rep)
ENTRY(clear_page_orig)
-
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -47,10 +42,12 @@ ENTRY(clear_page_orig)
nop
ret
ENDPROC(clear_page_orig)
+EXPORT_SYMBOL_GPL(clear_page_orig)
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
-ENDPROC(clear_page_c_e)
+ENDPROC(clear_page_erms)
+EXPORT_SYMBOL_GPL(clear_page_erms)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
+ pat.o pgtable.o physaddr.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..9f305be71a72 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT)
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
({ \
@@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
}
}
-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
- unsigned long P)
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
int i;
pte_t *start;
pgprotval_t prot;
- start = (pte_t *) pmd_page_vaddr(addr);
+ start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
#if PTRS_PER_PMD > 1
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
- unsigned long P)
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
pmd_t *start;
pgprotval_t prot;
- start = (pmd_t *) pud_page_vaddr(addr);
+ start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
@@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
- unsigned long P)
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
pud_t *start;
pgprotval_t prot;
pud_t *prev_pud = NULL;
- start = (pud_t *) pgd_page_vaddr(addr);
+ start = (pud_t *)p4d_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}
#else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a) pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
+{
+ int i;
+ p4d_t *start;
+ pgprotval_t prot;
+
+ start = (p4d_t *)pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+ if (!p4d_none(*start)) {
+ if (p4d_large(*start) || !p4d_present(*start)) {
+ prot = p4d_flags(*start);
+ note_page(m, st, __pgprot(prot), 2);
+ } else {
+ walk_pud_level(m, st, *start,
+ P + i * P4D_LEVEL_MULT);
+ }
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
#endif
static inline bool is_hypervisor_range(int idx)
@@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
- walk_pud_level(m, &st, *start,
+ walk_p4d_level(m, &st, *start,
i * PGD_LEVEL_MULT);
}
} else
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 428e31763cb9..8ad91a01cbc8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
unsigned index = pgd_index(address);
pgd_t *pgd_k;
+ p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
@@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
/*
* set_pgd(pgd, *pgd_k); here would be useless on PAE
* and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
+ * set_p4d/set_pud.
*/
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
+ p4d = p4d_offset(pgd, address);
+ p4d_k = p4d_offset(pgd_k, address);
+ if (!p4d_present(*p4d_k))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
+ pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
@@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(address)];
+ p4d_t *p4d;
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address)
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
#endif
- pmd = pmd_offset(pud_offset(pgd, address), address);
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
+ pmd = pmd_offset(pud, address);
printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
/*
@@ -425,6 +435,7 @@ void vmalloc_sync_all(void)
static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
+ p4d_t *p4d, *p4d_ref;
pud_t *pud, *pud_ref;
pmd_t *pmd, *pmd_ref;
pte_t *pte, *pte_ref;
@@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
- } else {
+ } else if (CONFIG_PGTABLE_LEVELS > 4) {
+ /*
+ * With folded p4d, pgd_none() is always false, so the pgd may
+ * point to an empty page table entry and pgd_page_vaddr()
+ * will return garbage.
+ *
+ * We will do the correct sanity check on the p4d level.
+ */
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
+ /* With 4-level paging, copying happens on the p4d level. */
+ p4d = p4d_offset(pgd, address);
+ p4d_ref = p4d_offset(pgd_ref, address);
+ if (p4d_none(*p4d_ref))
+ return -1;
+
+ if (p4d_none(*p4d)) {
+ set_p4d(p4d, *p4d_ref);
+ arch_flush_lazy_mmu_mode();
+ } else {
+ BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
+ }
+
/*
* Below here mismatches are bugs because these lower tables
* are shared:
*/
- pud = pud_offset(pgd, address);
- pud_ref = pud_offset(pgd_ref, address);
+ pud = pud_offset(p4d, address);
+ pud_ref = pud_offset(p4d_ref, address);
if (pud_none(*pud_ref))
return -1;
@@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
pgd_t *pgd = base + pgd_index(address);
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address)
if (!pgd_present(*pgd))
goto out;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (bad_address(p4d))
+ goto bad;
+
+ printk("P4D %lx ", p4d_val(*p4d));
+ if (!p4d_present(*p4d) || p4d_large(*p4d))
+ goto out;
+
+ pud = pud_offset(p4d, address);
if (bad_address(pud))
goto bad;
@@ -1082,6 +1122,7 @@ static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (!pgd_present(*pgd))
return 0;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (!p4d_present(*p4d))
+ return 0;
+
+ if (p4d_large(*p4d))
+ return spurious_fault_check(error_code, (pte_t *) p4d);
+
+ pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 1f3b6ef105cd..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Lockless get_user_pages_fast for x86
- *
- * Copyright (C) 2008 Nick Piggin
- * Copyright (C) 2008 Novell Inc.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include <linux/memremap.h>
-
-#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
-
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
-#ifndef CONFIG_X86_PAE
- return READ_ONCE(*ptep);
-#else
- /*
- * With get_user_pages_fast, we walk down the pagetables without taking
- * any locks. For this we would like to load the pointers atomically,
- * but that is not possible (without expensive cmpxchg8b) on PAE. What
- * we do have is the guarantee that a pte will only either go from not
- * present to present, or present to not present or both -- it will not
- * switch to a completely different present page without a TLB flush in
- * between; something that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees l iff pte_high
- * sees h. We load pte_high *after* loading pte_low, which ensures we
- * don't see an older value of pte_high. *Then* we recheck pte_low,
- * which ensures that we haven't picked up a changed pte high. We might
- * have got rubbish values from pte_low and pte_high, but we are
- * guaranteed that pte_low will not have the present bit set *unless*
- * it is 'l'. And get_user_pages_fast only operates on present ptes, so
- * we're safe.
- *
- * gup_get_pte should not be used or copied outside gup.c without being
- * very careful -- it does not atomically load the pte or anything that
- * is likely to be useful for you.
- */
- pte_t pte;
-
-retry:
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- if (unlikely(pte.pte_low != ptep->pte_low))
- goto retry;
-
- return pte;
-#endif
-}
-
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
-{
- while ((*nr) - nr_start) {
- struct page *page = pages[--(*nr)];
-
- ClearPageReferenced(page);
- put_page(page);
- }
-}
-
-/*
- * 'pteval' can come from a pte, pmd or pud. We only check
- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 3 types.
- */
-static inline int pte_allows_gup(unsigned long pteval, int write)
-{
- unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
-
- if (write)
- need_pte_bits |= _PAGE_RW;
-
- if ((pteval & need_pte_bits) != need_pte_bits)
- return 0;
-
- /* Check memory protection keys permissions. */
- if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
- return 0;
-
- return 1;
-}
-
-/*
- * The performance critical leaf functions are made noinline otherwise gcc
- * inlines everything into a single function which results in too much
- * register pressure.
- */
-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct dev_pagemap *pgmap = NULL;
- int nr_start = *nr, ret = 0;
- pte_t *ptep, *ptem;
-
- /*
- * Keep the original mapped PTE value (ptem) around since we
- * might increment ptep off the end of the page when finishing
- * our loop iteration.
- */
- ptem = ptep = pte_offset_map(&pmd, addr);
- do {
- pte_t pte = gup_get_pte(ptep);
- struct page *page;
-
- /* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_protnone(pte))
- break;
-
- if (!pte_allows_gup(pte_val(pte), write))
- break;
-
- if (pte_devmap(pte)) {
- pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- break;
- }
- } else if (pte_special(pte))
- break;
-
- VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
- get_page(page);
- put_dev_pagemap(pgmap);
- SetPageReferenced(page);
- pages[*nr] = page;
- (*nr)++;
-
- } while (ptep++, addr += PAGE_SIZE, addr != end);
- if (addr == end)
- ret = 1;
- pte_unmap(ptem);
-
- return ret;
-}
-
-static inline void get_head_page_multiple(struct page *page, int nr)
-{
- VM_BUG_ON_PAGE(page != compound_head(page), page);
- VM_BUG_ON_PAGE(page_count(page) == 0, page);
- page_ref_add(page, nr);
- SetPageReferenced(page);
-}
-
-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- int nr_start = *nr;
- struct dev_pagemap *pgmap = NULL;
-
- do {
- struct page *page = pfn_to_page(pfn);
-
- pgmap = get_dev_pagemap(pfn, pgmap);
- if (unlikely(!pgmap)) {
- undo_dev_pagemap(nr, nr_start, pages);
- return 0;
- }
- SetPageReferenced(page);
- pages[*nr] = page;
- get_page(page);
- put_dev_pagemap(pgmap);
- (*nr)++;
- pfn++;
- } while (addr += PAGE_SIZE, addr != end);
- return 1;
-}
-
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, struct page **pages, int *nr)
-{
- unsigned long fault_pfn;
-
- fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
-}
-
-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pmd_val(pmd), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
- if (pmd_devmap(pmd))
- return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pmd_page(pmd);
- page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pmd_t *pmdp;
-
- pmdp = pmd_offset(&pud, addr);
- do {
- pmd_t pmd = *pmdp;
-
- next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
- return 0;
- if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
- return 0;
- if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pte_range(pmd, addr, next, write, pages, nr))
- return 0;
- }
- } while (pmdp++, addr = next, addr != end);
-
- return 1;
-}
-
-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
-{
- struct page *head, *page;
- int refs;
-
- if (!pte_allows_gup(pud_val(pud), write))
- return 0;
-
- VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
- if (pud_devmap(pud))
- return __gup_device_huge_pud(pud, addr, end, pages, nr);
-
- /* hugepages are never "special" */
- VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
-
- refs = 0;
- head = pud_page(pud);
- page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
- get_head_page_multiple(head, refs);
-
- return 1;
-}
-
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
-{
- unsigned long next;
- pud_t *pudp;
-
- pudp = pud_offset(&pgd, addr);
- do {
- pud_t pud = *pudp;
-
- next = pud_addr_end(addr, end);
- if (pud_none(pud))
- return 0;
- if (unlikely(pud_large(pud))) {
- if (!gup_huge_pud(pud, addr, next, write, pages, nr))
- return 0;
- } else {
- if (!gup_pmd_range(pud, addr, next, write, pages, nr))
- return 0;
- }
- } while (pudp++, addr = next, addr != end);
-
- return 1;
-}
-
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- unsigned long flags;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
- return 0;
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_save(flags);
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- break;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- break;
- } while (pgdp++, addr = next, addr != end);
- local_irq_restore(flags);
-
- return nr;
-}
-
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, len, end;
- unsigned long next;
- pgd_t *pgdp;
- int nr = 0;
-
- start &= PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
-
- end = start + len;
- if (end < start)
- goto slow_irqon;
-
-#ifdef CONFIG_X86_64
- if (end >> __VIRTUAL_MASK_SHIFT)
- goto slow_irqon;
-#endif
-
- /*
- * XXX: batch / limit 'nr', to avoid large irq off latency
- * needs some instrumenting to determine the common sizes used by
- * important workloads (eg. DB2), and whether limiting the batch size
- * will decrease performance.
- *
- * It seems like we're in the clear for the moment. Direct-IO is
- * the main guy that batches up lots of get_user_pages, and even
- * they are limited to 64-at-a-time which is not so many.
- */
- /*
- * This doesn't prevent pagetable teardown, but does prevent
- * the pagetables and pages from being freed on x86.
- *
- * So long as we atomically load page table pointers versus teardown
- * (which we do on x86, with the above PAE exception), we can follow the
- * address down to the the page and take a ref on it.
- */
- local_irq_disable();
- pgdp = pgd_offset(mm, addr);
- do {
- pgd_t pgd = *pgdp;
-
- next = pgd_addr_end(addr, end);
- if (pgd_none(pgd))
- goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
- goto slow;
- } while (pgdp++, addr = next, addr != end);
- local_irq_enable();
-
- VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
- return nr;
-
- {
- int ret;
-
-slow:
- local_irq_enable();
-slow_irqon:
- /* Try to get the remaining pages with get_user_pages */
- start += nr << PAGE_SHIFT;
- pages += nr;
-
- ret = get_user_pages_unlocked(start,
- (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
-
- /* Have to be a bit careful with return values */
- if (nr > 0) {
- if (ret < 0)
- ret = nr;
- else
- ret += nr;
- }
-
- return ret;
- }
-}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index c5066a260803..302f43fd9c28 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -12,10 +12,12 @@
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
+#include <linux/compat.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
+#include <asm/elf.h>
#if 0 /* This is just for testing */
struct page *
@@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
info.flags = 0;
info.length = len;
- info.low_limit = current->mm->mmap_legacy_base;
- info.high_limit = TASK_SIZE;
+ info.low_limit = get_mmap_base(1);
+ info.high_limit = in_compat_syscall() ?
+ tasksize_32bit() : tasksize_64bit();
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = get_mmap_base(0);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 4473cb4f8b90..04210a29dd60 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
return 0;
}
+static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long next;
+
+ for (; addr < end; addr = next) {
+ p4d_t *p4d = p4d_page + p4d_index(addr);
+ pud_t *pud;
+
+ next = (addr & P4D_MASK) + P4D_SIZE;
+ if (next > end)
+ next = end;
+
+ if (p4d_present(*p4d)) {
+ pud = pud_offset(p4d, 0);
+ ident_pud_init(info, pud, addr, next);
+ continue;
+ }
+ pud = (pud_t *)info->alloc_pgt_page(info->context);
+ if (!pud)
+ return -ENOMEM;
+ ident_pud_init(info, pud, addr, next);
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+
+ return 0;
+}
+
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend)
{
@@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
- pud_t *pud;
+ p4d_t *p4d;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
if (next > end)
next = end;
if (pgd_present(*pgd)) {
- pud = pud_offset(pgd, 0);
- result = ident_pud_init(info, pud, addr, next);
+ p4d = p4d_offset(pgd, 0);
+ result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
continue;
}
- pud = (pud_t *)info->alloc_pgt_page(info->context);
- if (!pud)
+ p4d = (p4d_t *)info->alloc_pgt_page(info->context);
+ if (!p4d)
return -ENOMEM;
- result = ident_pud_init(info, pud, addr, next);
+ result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ } else {
+ /*
+ * With p4d folded, pgd is equal to p4d.
+ * The pgd entry has to point to the pud page table in this case.
+ */
+ pud_t *pud = pud_offset(p4d, 0);
+ set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
}
return 0;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2b4b53e6793f..601b8e04e5c6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -56,8 +56,6 @@
unsigned long highstart_pfn, highend_pfn;
-static noinline int do_test_wp_bit(void);
-
bool __read_mostly __vmalloc_start_set = false;
/*
@@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false;
*/
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd_table;
@@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
return pmd_table;
}
#endif
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
pmd_table = pmd_offset(pud, 0);
return pmd_table;
@@ -390,8 +391,11 @@ pte_t *kmap_pte;
static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
- return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
- vaddr), vaddr), vaddr);
+ pgd_t *pgd = pgd_offset_k(vaddr);
+ p4d_t *p4d = p4d_offset(pgd, vaddr);
+ pud_t *pud = pud_offset(p4d, vaddr);
+ pmd_t *pmd = pmd_offset(pud, vaddr);
+ return pte_offset_kernel(pmd, vaddr);
}
static void __init kmap_init(void)
@@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + pgd_index(vaddr);
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
pkmap_page_table = pte;
@@ -450,6 +456,7 @@ void __init native_pagetable_init(void)
{
unsigned long pfn, va;
pgd_t *pgd, *base = swapper_pg_dir;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -469,7 +476,8 @@ void __init native_pagetable_init(void)
if (!pgd_present(*pgd))
break;
- pud = pud_offset(pgd, va);
+ p4d = p4d_offset(pgd, va);
+ pud = pud_offset(p4d, va);
pmd = pmd_offset(pud, va);
if (!pmd_present(*pmd))
break;
@@ -716,20 +724,20 @@ void __init paging_init(void)
*/
static void __init test_wp_bit(void)
{
- printk(KERN_INFO
- "Checking if this processor honours the WP bit even in supervisor mode...");
+ char z = 0;
+
+ printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode...");
- /* Any page-aligned address will do, the test is non-destructive */
- __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
- boot_cpu_data.wp_works_ok = do_test_wp_bit();
- clear_fixmap(FIX_WP_TEST);
+ __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
- if (!boot_cpu_data.wp_works_ok) {
- printk(KERN_CONT "No.\n");
- panic("Linux doesn't support CPUs with broken WP.");
- } else {
+ if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
+ clear_fixmap(FIX_WP_TEST);
printk(KERN_CONT "Ok.\n");
+ return;
}
+
+ printk(KERN_CONT "No.\n");
+ panic("Linux doesn't support CPUs with broken WP.");
}
void __init mem_init(void)
@@ -811,8 +819,7 @@ void __init mem_init(void)
BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
- if (boot_cpu_data.wp_works_ok < 0)
- test_wp_bit();
+ test_wp_bit();
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -840,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size)
#endif
#endif
-/*
- * This function cannot be __init, since exceptions don't work in that
- * section. Put this after the callers, so that it cannot be inlined.
- */
-static noinline int do_test_wp_bit(void)
-{
- char tmp_reg;
- int flag;
-
- __asm__ __volatile__(
- " movb %0, %1 \n"
- "1: movb %1, %0 \n"
- " xorl %2, %2 \n"
- "2: \n"
- _ASM_EXTABLE(1b,2b)
- :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
- "=q" (tmp_reg),
- "=r" (flag)
- :"2" (1)
- :"memory");
-
- return flag;
-}
-
int kernel_set_to_readonly __read_mostly;
void set_kernel_text_rw(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 15173d37f399..7bdda6f1d135 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end)
unsigned long address;
for (address = start; address <= end; address += PGDIR_SIZE) {
- const pgd_t *pgd_ref = pgd_offset_k(address);
+ pgd_t *pgd_ref = pgd_offset_k(address);
+ const p4d_t *p4d_ref;
struct page *page;
- if (pgd_none(*pgd_ref))
+ /*
+ * With folded p4d, pgd_none() is always false, we need to
+ * handle synchonization on p4d level.
+ */
+ BUILD_BUG_ON(pgd_none(*pgd_ref));
+ p4d_ref = p4d_offset(pgd_ref, address);
+
+ if (p4d_none(*p4d_ref))
continue;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
+ p4d_t *p4d;
spinlock_t *pgt_lock;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
+ p4d = p4d_offset(pgd, address);
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
- BUG_ON(pgd_page_vaddr(*pgd)
- != pgd_page_vaddr(*pgd_ref));
+ if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+ BUG_ON(p4d_page_vaddr(*p4d)
+ != p4d_page_vaddr(*p4d_ref));
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
+ if (p4d_none(*p4d))
+ set_p4d(p4d, *p4d_ref);
spin_unlock(pgt_lock);
}
@@ -149,16 +159,28 @@ static __ref void *spp_getpage(void)
return ptr;
}
-static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
if (pgd_none(*pgd)) {
- pud_t *pud = (pud_t *)spp_getpage();
- pgd_populate(&init_mm, pgd, pud);
- if (pud != pud_offset(pgd, 0))
+ p4d_t *p4d = (p4d_t *)spp_getpage();
+ pgd_populate(&init_mm, pgd, p4d);
+ if (p4d != p4d_offset(pgd, 0))
printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
- pud, pud_offset(pgd, 0));
+ p4d, p4d_offset(pgd, 0));
+ }
+ return p4d_offset(pgd, vaddr);
+}
+
+static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
+{
+ if (p4d_none(*p4d)) {
+ pud_t *pud = (pud_t *)spp_getpage();
+ p4d_populate(&init_mm, p4d, pud);
+ if (pud != pud_offset(p4d, 0))
+ printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ pud, pud_offset(p4d, 0));
}
- return pud_offset(pgd, vaddr);
+ return pud_offset(p4d, vaddr);
}
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
@@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
pmd_t *pmd = (pmd_t *) spp_getpage();
pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0))
- printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+ printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
}
return pmd_offset(pud, vaddr);
@@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
pte_t *pte = (pte_t *) spp_getpage();
pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0))
- printk(KERN_ERR "PAGETABLE BUG #02!\n");
+ printk(KERN_ERR "PAGETABLE BUG #03!\n");
}
return pte_offset_kernel(pmd, vaddr);
}
-void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pud = pud_page + pud_index(vaddr);
- pmd = fill_pmd(pud, vaddr);
- pte = fill_pte(pmd, vaddr);
+ pmd_t *pmd = fill_pmd(pud, vaddr);
+ pte_t *pte = fill_pte(pmd, vaddr);
set_pte(pte, new_pte);
@@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
__flush_tlb_one(vaddr);
}
+void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
+{
+ p4d_t *p4d = p4d_page + p4d_index(vaddr);
+ pud_t *pud = fill_pud(p4d, vaddr);
+
+ __set_pte_vaddr(pud, vaddr, new_pte);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+ pud_t *pud = pud_page + pud_index(vaddr);
+
+ __set_pte_vaddr(pud, vaddr, new_pte);
+}
+
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
- pud_t *pud_page;
+ p4d_t *p4d_page;
pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
@@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
"PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
- pud_page = (pud_t*)pgd_page_vaddr(*pgd);
- set_pte_vaddr_pud(pud_page, vaddr, pteval);
+
+ p4d_page = p4d_offset(pgd, 0);
+ set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(vaddr);
- pud = fill_pud(pgd, vaddr);
+ p4d = fill_p4d(pgd, vaddr);
+ pud = fill_pud(p4d, vaddr);
return fill_pmd(pud, vaddr);
}
@@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
enum page_cache_mode cache)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgprot_t prot;
@@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
if (pgd_none(*pgd)) {
+ p4d = (p4d_t *) spp_getpage();
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
+ _PAGE_USER));
+ }
+ p4d = p4d_offset(pgd, (unsigned long)__va(phys));
+ if (p4d_none(*p4d)) {
pud = (pud_t *) spp_getpage();
- set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
_PAGE_USER));
}
- pud = pud_offset(pgd, (unsigned long)__va(phys));
+ pud = pud_offset(p4d, (unsigned long)__va(phys));
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
@@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
+ p4d_t *p4d;
pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
- if (pgd_val(*pgd)) {
- pud = (pud_t *)pgd_page_vaddr(*pgd);
+ BUILD_BUG_ON(pgd_none(*pgd));
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_val(*p4d)) {
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
@@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- pgd_populate(&init_mm, pgd, pud);
+ p4d_populate(&init_mm, p4d, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
spin_unlock(&init_mm.page_table_lock);
}
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ /* free a pud talbe */
+ free_pagetable(p4d_page(*p4d), 0);
+ spin_lock(&init_mm.page_table_lock);
+ p4d_clear(p4d);
+ spin_unlock(&init_mm.page_table_lock);
+}
+
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
bool direct)
@@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
update_page_count(PG_LEVEL_1G, -pages);
}
+static void __meminit
+remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
+ bool direct)
+{
+ unsigned long next, pages = 0;
+ pud_t *pud_base;
+ p4d_t *p4d;
+
+ p4d = p4d_start + p4d_index(addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ BUILD_BUG_ON(p4d_large(*p4d));
+
+ pud_base = (pud_t *)p4d_page_vaddr(*p4d);
+ remove_pud_table(pud_base, addr, next, direct);
+ free_pud_table(pud_base, p4d);
+ }
+
+ if (direct)
+ update_page_count(PG_LEVEL_512G, -pages);
+}
+
/* start and end are both virtual address. */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct)
@@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
unsigned long next;
unsigned long addr;
pgd_t *pgd;
- pud_t *pud;
+ p4d_t *p4d;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
@@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
if (!pgd_present(*pgd))
continue;
- pud = (pud_t *)pgd_page_vaddr(*pgd);
- remove_pud_table(pud, addr, next, direct);
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ remove_p4d_table(p4d, addr, next, direct);
}
flush_tlb_all();
@@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr)
if (pgd_none(*pgd))
return 0;
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return 0;
+
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return 0;
@@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
unsigned long addr;
unsigned long next;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (!pgd)
return -ENOMEM;
- pud = vmemmap_pud_populate(pgd, addr, node);
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return -ENOMEM;
+
+ pud = vmemmap_pud_populate(p4d, addr, node);
if (!pud)
return -ENOMEM;
@@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
unsigned long end = (unsigned long)(start_page + size);
unsigned long next;
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
unsigned int nr_pages;
@@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr,
}
get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
- pud = pud_offset(pgd, addr);
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ continue;
+ }
+ get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
+
+ pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
continue;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 7aaa2635862d..a5e1cda85974 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
/* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(addr)];
- pud_t *pud = pud_offset(pgd, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 4c90cfdc128b..47efdcfe7113 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -33,8 +33,19 @@ static int __init map_range(struct range *range)
static void __init clear_pgds(unsigned long start,
unsigned long end)
{
- for (; start < end; start += PGDIR_SIZE)
- pgd_clear(pgd_offset_k(start));
+ pgd_t *pgd;
+
+ for (; start < end; start += PGDIR_SIZE) {
+ pgd = pgd_offset_k(start);
+ /*
+ * With folded p4d, pgd_clear() is nop, use p4d_clear()
+ * instead.
+ */
+ if (CONFIG_PGTABLE_LEVELS < 5)
+ p4d_clear(p4d_offset(pgd, start));
+ else
+ pgd_clear(pgd);
+ }
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
@@ -44,8 +55,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
unsigned long end = KASAN_SHADOW_END;
for (i = pgd_index(start); start < end; i++) {
- pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
- | _KERNPG_TABLE);
+ switch (CONFIG_PGTABLE_LEVELS) {
+ case 4:
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
+ _KERNPG_TABLE);
+ break;
+ case 5:
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
+ _KERNPG_TABLE);
+ break;
+ default:
+ BUILD_BUG();
+ }
start += PGDIR_SIZE;
}
}
@@ -73,6 +94,7 @@ void __init kasan_early_init(void)
pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+ p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
for (i = 0; i < PTRS_PER_PTE; i++)
kasan_zero_pte[i] = __pte(pte_val);
@@ -83,6 +105,9 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
+ for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+ kasan_zero_p4d[i] = __p4d(p4d_val);
+
kasan_map_early_shadow(early_level4_pgt);
kasan_map_early_shadow(init_level4_pgt);
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 7940166c799b..19ad095b41df 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -30,30 +30,44 @@
#include <linux/limits.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
+#include <linux/compat.h>
#include <asm/elf.h>
struct va_alignment __read_mostly va_align = {
.flags = -1,
};
-static unsigned long stack_maxrandom_size(void)
+unsigned long tasksize_32bit(void)
+{
+ return IA32_PAGE_OFFSET;
+}
+
+unsigned long tasksize_64bit(void)
+{
+ return TASK_SIZE_MAX;
+}
+
+static unsigned long stack_maxrandom_size(unsigned long task_size)
{
unsigned long max = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
+ max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
+ max <<= PAGE_SHIFT;
}
return max;
}
-/*
- * Top of mmap area (just below the process stack).
- *
- * Leave an at least ~128 MB hole with possible stack randomization.
- */
-#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
-#define MAX_GAP (TASK_SIZE/6*5)
+#ifdef CONFIG_COMPAT
+# define mmap32_rnd_bits mmap_rnd_compat_bits
+# define mmap64_rnd_bits mmap_rnd_bits
+#else
+# define mmap32_rnd_bits mmap_rnd_bits
+# define mmap64_rnd_bits mmap_rnd_bits
+#endif
+
+#define SIZE_128M (128 * 1024 * 1024UL)
static int mmap_is_legacy(void)
{
@@ -66,54 +80,91 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-unsigned long arch_mmap_rnd(void)
+static unsigned long arch_rnd(unsigned int rndbits)
{
- unsigned long rnd;
-
- if (mmap_is_ia32())
-#ifdef CONFIG_COMPAT
- rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-#else
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-#endif
- else
- rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+ return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
+}
- return rnd << PAGE_SHIFT;
+unsigned long arch_mmap_rnd(void)
+{
+ if (!(current->flags & PF_RANDOMIZE))
+ return 0;
+ return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
-static unsigned long mmap_base(unsigned long rnd)
+static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
{
unsigned long gap = rlimit(RLIMIT_STACK);
+ unsigned long gap_min, gap_max;
+
+ /*
+ * Top of mmap area (just below the process stack).
+ * Leave an at least ~128 MB hole with possible stack randomization.
+ */
+ gap_min = SIZE_128M + stack_maxrandom_size(task_size);
+ gap_max = (task_size / 6) * 5;
- if (gap < MIN_GAP)
- gap = MIN_GAP;
- else if (gap > MAX_GAP)
- gap = MAX_GAP;
+ if (gap < gap_min)
+ gap = gap_min;
+ else if (gap > gap_max)
+ gap = gap_max;
+
+ return PAGE_ALIGN(task_size - gap - rnd);
+}
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+static unsigned long mmap_legacy_base(unsigned long rnd,
+ unsigned long task_size)
+{
+ return __TASK_UNMAPPED_BASE(task_size) + rnd;
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
+static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
+ unsigned long random_factor, unsigned long task_size)
+{
+ *legacy_base = mmap_legacy_base(random_factor, task_size);
+ if (mmap_is_legacy())
+ *base = *legacy_base;
+ else
+ *base = mmap_base(random_factor, task_size);
+}
+
void arch_pick_mmap_layout(struct mm_struct *mm)
{
- unsigned long random_factor = 0UL;
+ if (mmap_is_legacy())
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ else
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
- if (current->flags & PF_RANDOMIZE)
- random_factor = arch_mmap_rnd();
+ arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
+ arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ /*
+ * The mmap syscall mapping base decision depends solely on the
+ * syscall type (64-bit or compat). This applies for 64bit
+ * applications and 32bit applications. The 64bit syscall uses
+ * mmap_base, the compat syscall uses mmap_compat_base.
+ */
+ arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
+ arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+#endif
+}
- mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
+unsigned long get_mmap_base(int is_legacy)
+{
+ struct mm_struct *mm = current->mm;
- if (mmap_is_legacy()) {
- mm->mmap_base = mm->mmap_legacy_base;
- mm->get_unmapped_area = arch_get_unmapped_area;
- } else {
- mm->mmap_base = mmap_base(random_factor);
- mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ if (in_compat_syscall()) {
+ return is_legacy ? mm->mmap_compat_legacy_base
+ : mm->mmap_compat_base;
}
+#endif
+ return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
}
const char *arch_vma_name(struct vm_area_struct *vma)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 12dcad7297a5..175f54ac6772 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid)
nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
MEMBLOCK_ALLOC_ACCESSIBLE);
if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
+ pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
nd_size, nid);
return;
}
@@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid)
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
- * Sanitize @mi by merging and removing unncessary memblks. Also check for
+ * Sanitize @mi by merging and removing unnecessary memblks. Also check for
* conflicts and clear unused memblks.
*
* RETURNS:
@@ -314,20 +314,6 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
return 0;
}
-/*
- * Set nodes, which have memory in @mi, in *@nodemask.
- */
-static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
- const struct numa_meminfo *mi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
- if (mi->blk[i].start != mi->blk[i].end &&
- mi->blk[i].nid != NUMA_NO_NODE)
- node_set(mi->blk[i].nid, *nodemask);
-}
-
/**
* numa_reset_distance - Reset NUMA distance table
*
@@ -347,16 +333,12 @@ void __init numa_reset_distance(void)
static int __init numa_alloc_distance(void)
{
- nodemask_t nodes_parsed;
size_t size;
int i, j, cnt = 0;
u64 phys;
/* size the new table and allocate it */
- nodes_parsed = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
-
- for_each_node_mask(i, nodes_parsed)
+ for_each_node_mask(i, numa_nodes_parsed)
cnt = i;
cnt++;
size = cnt * cnt * sizeof(numa_distance[0]);
@@ -535,7 +517,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
/* Account for nodes with cpus and no memory */
node_possible_map = numa_nodes_parsed;
- numa_nodemask_from_meminfo(&node_possible_map, mi);
if (WARN_ON(nodes_empty(node_possible_map)))
return -EINVAL;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 28d42130243c..b5949017dead 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level)
{
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pgd_none(*pgd))
return NULL;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d))
+ return NULL;
+
+ *level = PG_LEVEL_512G;
+ if (p4d_large(*p4d) || !p4d_present(*p4d))
+ return (pte_t *)p4d;
+
+ pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
@@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
pmd_t *lookup_pmd_address(unsigned long address)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(address);
if (pgd_none(*pgd))
return NULL;
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+ return NULL;
+
+ pud = pud_offset(p4d, address);
if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
return NULL;
@@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
- pud = pud_offset(pgd, address);
+ p4d = p4d_offset(pgd, address);
+ pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
set_pte_atomic((pte_t *)pmd, pte);
}
@@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
- pud_t *pud = pud_offset(pgd, start);
+ pud_t *pud = pud_offset(p4d, start);
/*
* Not on a GB page boundary?
@@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa,
return num_pages;
}
-static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
- pgprot_t pgprot)
+static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
+ pgprot_t pgprot)
{
pud_t *pud;
unsigned long end;
@@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
cur_pages = (pre_end - start) >> PAGE_SHIFT;
cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
- pud = pud_offset(pgd, start);
+ pud = pud_offset(p4d, start);
/*
* Need a PMD page?
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
if (cpa->numpages == cur_pages)
return cur_pages;
- pud = pud_offset(pgd, start);
+ pud = pud_offset(p4d, start);
pud_pgprot = pgprot_4k_2_large(pgprot);
/*
@@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
if (start < end) {
long tmp;
- pud = pud_offset(pgd, start);
+ pud = pud_offset(p4d, start);
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
@@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
pud_t *pud = NULL; /* shut up gcc */
+ p4d_t *p4d;
pgd_t *pgd_entry;
long ret;
pgd_entry = cpa->pgd + pgd_index(addr);
+ if (pgd_none(*pgd_entry)) {
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ if (!p4d)
+ return -1;
+
+ set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+
/*
* Allocate a PUD page and hand it down for mapping.
*/
- if (pgd_none(*pgd_entry)) {
+ p4d = p4d_offset(pgd_entry, addr);
+ if (p4d_none(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!pud)
return -1;
- set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
- ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+ ret = populate_pud(cpa, addr, p4d, pgprot);
if (ret < 0) {
/*
* Leave the PUD page in place in case some other CPU or thread
* already found it, but remove any useless entries we just
* added to it.
*/
- unmap_pud_range(pgd_entry, addr,
+ unmap_pud_range(p4d, addr,
addr + (cpa->numpages << PAGE_SHIFT));
return ret;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6cbdff26bb96..508a708eb9a6 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+ paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- CONFIG_PGTABLE_LEVELS == 4) {
+ CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
+ p4d_t *p4d;
pud_t *pud;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
pmd_t *pmd = pmds[i];
@@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
}
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+ return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+ return 0;
+}
+#endif
+
/**
* pud_set_huge - setup kernel PUD mapping
*
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 9adce776852b..3d275a791c76 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
BUG();
return;
}
- pud = pud_offset(pgd, vaddr);
+ p4d = p4d_offset(pgd, vaddr);
+ if (p4d_none(*p4d)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(p4d, vaddr);
if (pud_none(*pud)) {
BUG();
return;
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index 066619b0700c..f1d83b34c329 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,6 +1,5 @@
OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
-obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index cef39b097649..3481268da3d0 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void)
load_cr3(initial_page_table);
__flush_tlb_all();
- gdt_descr.address = __pa(get_cpu_gdt_table(0));
+ gdt_descr.address = get_cpu_gdt_paddr(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
@@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
struct desc_ptr gdt_descr;
- gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+ gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index a4695da42d77..7fdf11aa50c9 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -47,7 +47,7 @@
#include <asm/pgalloc.h>
/*
- * We allocate runtime services regions bottom-up, starting from -4G, i.e.
+ * We allocate runtime services regions top-down, starting from -4G, i.e.
* 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
*/
static u64 efi_va = EFI_VA_START;
@@ -135,6 +135,7 @@ static pgd_t *efi_pgd;
int __init efi_alloc_page_tables(void)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
gfp_t gfp_mask;
@@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void)
return -ENOMEM;
pgd = efi_pgd + pgd_index(EFI_VA_END);
+ p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
+ if (!p4d) {
+ free_page((unsigned long)efi_pgd);
+ return -ENOMEM;
+ }
- pud = pud_alloc_one(NULL, 0);
+ pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
+ if (CONFIG_PGTABLE_LEVELS > 4)
+ free_page((unsigned long) pgd_page_vaddr(*pgd));
free_page((unsigned long)efi_pgd);
return -ENOMEM;
}
- pgd_populate(NULL, pgd, pud);
-
return 0;
}
@@ -166,6 +172,7 @@ void efi_sync_low_kernel_mappings(void)
{
unsigned num_entries;
pgd_t *pgd_k, *pgd_efi;
+ p4d_t *p4d_k, *p4d_efi;
pud_t *pud_k, *pud_efi;
if (efi_enabled(EFI_OLD_MEMMAP))
@@ -190,23 +197,37 @@ void efi_sync_low_kernel_mappings(void)
memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
/*
+ * As with PGDs, we share all P4D entries apart from the one entry
+ * that covers the EFI runtime mapping space.
+ */
+ BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END));
+ BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK));
+
+ pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
+ pgd_k = pgd_offset_k(EFI_VA_END);
+ p4d_efi = p4d_offset(pgd_efi, 0);
+ p4d_k = p4d_offset(pgd_k, 0);
+
+ num_entries = p4d_index(EFI_VA_END);
+ memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
+
+ /*
* We share all the PUD entries apart from those that map the
* EFI regions. Copy around them.
*/
BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
- pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
- pud_efi = pud_offset(pgd_efi, 0);
-
- pgd_k = pgd_offset_k(EFI_VA_END);
- pud_k = pud_offset(pgd_k, 0);
+ p4d_efi = p4d_offset(pgd_efi, EFI_VA_END);
+ p4d_k = p4d_offset(pgd_k, EFI_VA_END);
+ pud_efi = pud_offset(p4d_efi, 0);
+ pud_k = pud_offset(p4d_k, 0);
num_entries = pud_index(EFI_VA_END);
memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
- pud_efi = pud_offset(pgd_efi, EFI_VA_START);
- pud_k = pud_offset(pgd_k, EFI_VA_START);
+ pud_efi = pud_offset(p4d_efi, EFI_VA_START);
+ pud_k = pud_offset(p4d_k, EFI_VA_START);
num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index 3dbde04febdc..53e0235e308f 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -2,8 +2,9 @@
obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
# SDHCI Devices
obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
-# WiFi
+# WiFi + BT
obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
+obj-$(subst m,y,$(CONFIG_BT_HCIUART_BCM)) += platform_bt.o
# IPC Devices
obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
obj-$(subst m,y,$(CONFIG_SND_MFLD_MACHINE)) += platform_msic_audio.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bt.c b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
new file mode 100644
index 000000000000..5a0483e7bf66
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bt.c
@@ -0,0 +1,108 @@
+/*
+ * Bluetooth platform data initialization file
+ *
+ * (C) Copyright 2017 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio/machine.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+#include <asm/intel-mid.h>
+
+struct bt_sfi_data {
+ struct device *dev;
+ const char *name;
+ int (*setup)(struct bt_sfi_data *ddata);
+};
+
+static struct gpiod_lookup_table tng_bt_sfi_gpio_table = {
+ .dev_id = "hci_bcm",
+ .table = {
+ GPIO_LOOKUP("0000:00:0c.0", -1, "device-wakeup", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "shutdown", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "host-wakeup", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
+#define TNG_BT_SFI_GPIO_DEVICE_WAKEUP "bt_wakeup"
+#define TNG_BT_SFI_GPIO_SHUTDOWN "BT-reset"
+#define TNG_BT_SFI_GPIO_HOST_WAKEUP "bt_uart_enable"
+
+static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
+{
+ struct gpiod_lookup_table *table = &tng_bt_sfi_gpio_table;
+ struct gpiod_lookup *lookup = table->table;
+ struct pci_dev *pdev;
+
+ /* Connected to /dev/ttyS0 */
+ pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(4, 1));
+ if (!pdev)
+ return -ENODEV;
+
+ ddata->dev = &pdev->dev;
+ ddata->name = table->dev_id;
+
+ lookup[0].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_DEVICE_WAKEUP);
+ lookup[1].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_SHUTDOWN);
+ lookup[2].chip_hwnum = get_gpio_by_name(TNG_BT_SFI_GPIO_HOST_WAKEUP);
+
+ gpiod_add_lookup_table(table);
+ return 0;
+}
+
+static struct bt_sfi_data tng_bt_sfi_data __initdata = {
+ .setup = tng_bt_sfi_setup,
+};
+
+#define ICPU(model, ddata) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (kernel_ulong_t)&ddata }
+
+static const struct x86_cpu_id bt_sfi_cpu_ids[] = {
+ ICPU(INTEL_FAM6_ATOM_MERRIFIELD, tng_bt_sfi_data),
+ {}
+};
+
+static int __init bt_sfi_init(void)
+{
+ struct platform_device_info info;
+ struct platform_device *pdev;
+ const struct x86_cpu_id *id;
+ struct bt_sfi_data *ddata;
+ int ret;
+
+ id = x86_match_cpu(bt_sfi_cpu_ids);
+ if (!id)
+ return -ENODEV;
+
+ ddata = (struct bt_sfi_data *)id->driver_data;
+ if (!ddata)
+ return -ENODEV;
+
+ ret = ddata->setup(ddata);
+ if (ret)
+ return ret;
+
+ memset(&info, 0, sizeof(info));
+ info.fwnode = ddata->dev->fwnode;
+ info.parent = ddata->dev;
+ info.name = ddata->name,
+ info.id = PLATFORM_DEVID_NONE,
+
+ pdev = platform_device_register_full(&info);
+ if (IS_ERR(pdev))
+ return PTR_ERR(pdev);
+
+ dev_info(ddata->dev, "Registered Bluetooth device: %s\n", ddata->name);
+ return 0;
+}
+device_initcall(bt_sfi_init);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index f25982cdff90..42e65fee5673 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -23,28 +23,7 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
-static struct bau_operations ops;
-
-static struct bau_operations uv123_bau_ops = {
- .bau_gpa_to_offset = uv_gpa_to_offset,
- .read_l_sw_ack = read_mmr_sw_ack,
- .read_g_sw_ack = read_gmmr_sw_ack,
- .write_l_sw_ack = write_mmr_sw_ack,
- .write_g_sw_ack = write_gmmr_sw_ack,
- .write_payload_first = write_mmr_payload_first,
- .write_payload_last = write_mmr_payload_last,
-};
-
-static struct bau_operations uv4_bau_ops = {
- .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
- .read_l_sw_ack = read_mmr_proc_sw_ack,
- .read_g_sw_ack = read_gmmr_proc_sw_ack,
- .write_l_sw_ack = write_mmr_proc_sw_ack,
- .write_g_sw_ack = write_gmmr_proc_sw_ack,
- .write_payload_first = write_mmr_proc_payload_first,
- .write_payload_last = write_mmr_proc_payload_last,
-};
-
+static struct bau_operations ops __ro_after_init;
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
@@ -548,11 +527,12 @@ static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
* return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
*/
static int uv1_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
unsigned long descriptor_status;
cycles_t ttm;
+ u64 mmr_offset = bcp->status_mmr;
+ int right_shift = bcp->status_index;
struct ptc_stats *stat = bcp->statp;
descriptor_status = uv1_read_status(mmr_offset, right_shift);
@@ -640,11 +620,12 @@ int handle_uv2_busy(struct bau_control *bcp)
}
static int uv2_3_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
unsigned long descriptor_stat;
cycles_t ttm;
+ u64 mmr_offset = bcp->status_mmr;
+ int right_shift = bcp->status_index;
int desc = bcp->uvhub_cpu;
long busy_reps = 0;
struct ptc_stats *stat = bcp->statp;
@@ -706,28 +687,59 @@ static int uv2_3_wait_completion(struct bau_desc *bau_desc,
}
/*
- * There are 2 status registers; each and array[32] of 2 bits. Set up for
- * which register to read and position in that register based on cpu in
- * current hub.
+ * Returns the status of current BAU message for cpu desc as a bit field
+ * [Error][Busy][Aux]
*/
-static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try)
+static u64 read_status(u64 status_mmr, int index, int desc)
{
- int right_shift;
- unsigned long mmr_offset;
+ u64 stat;
+
+ stat = ((read_lmmr(status_mmr) >> index) & UV_ACT_STATUS_MASK) << 1;
+ stat |= (read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_2) >> desc) & 0x1;
+
+ return stat;
+}
+
+static int uv4_wait_completion(struct bau_desc *bau_desc,
+ struct bau_control *bcp, long try)
+{
+ struct ptc_stats *stat = bcp->statp;
+ u64 descriptor_stat;
+ u64 mmr = bcp->status_mmr;
+ int index = bcp->status_index;
int desc = bcp->uvhub_cpu;
- if (desc < UV_CPUS_PER_AS) {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
- right_shift = desc * UV_ACT_STATUS_SIZE;
- } else {
- mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
- right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
- }
+ descriptor_stat = read_status(mmr, index, desc);
- if (bcp->uvhub_version == 1)
- return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
- else
- return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
+ /* spin on the status MMR, waiting for it to go idle */
+ while (descriptor_stat != UV2H_DESC_IDLE) {
+ switch (descriptor_stat) {
+ case UV2H_DESC_SOURCE_TIMEOUT:
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+
+ case UV2H_DESC_DEST_TIMEOUT:
+ stat->s_dtimeout++;
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+
+ case UV2H_DESC_DEST_STRONG_NACK:
+ stat->s_plugged++;
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+
+ case UV2H_DESC_DEST_PUT_ERR:
+ bcp->conseccompletes = 0;
+ return FLUSH_GIVEUP;
+
+ default:
+ /* descriptor_stat is still BUSY */
+ cpu_relax();
+ }
+ descriptor_stat = read_status(mmr, index, desc);
+ }
+ bcp->conseccompletes++;
+ return FLUSH_COMPLETE;
}
/*
@@ -918,7 +930,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
struct uv1_bau_msg_header *uv1_hdr = NULL;
struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;
- if (bcp->uvhub_version == 1) {
+ if (bcp->uvhub_version == UV_BAU_V1) {
uv1 = 1;
uv1_throttle(hmaster, stat);
}
@@ -958,7 +970,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
write_mmr_activation(index);
try++;
- completion_stat = wait_completion(bau_desc, bcp, try);
+ completion_stat = ops.wait_completion(bau_desc, bcp, try);
handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
@@ -1114,15 +1126,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
unsigned long end,
unsigned int cpu)
{
- int locals = 0;
- int remotes = 0;
- int hubs = 0;
+ int locals = 0, remotes = 0, hubs = 0;
struct bau_desc *bau_desc;
struct cpumask *flush_mask;
struct ptc_stats *stat;
struct bau_control *bcp;
- unsigned long descriptor_status;
- unsigned long status;
+ unsigned long descriptor_status, status, address;
bcp = &per_cpu(bau_control, cpu);
@@ -1171,10 +1180,24 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
if (!end || (end - start) <= PAGE_SIZE)
- bau_desc->payload.address = start;
+ address = start;
else
- bau_desc->payload.address = TLB_FLUSH_ALL;
- bau_desc->payload.sending_cpu = cpu;
+ address = TLB_FLUSH_ALL;
+
+ switch (bcp->uvhub_version) {
+ case UV_BAU_V1:
+ case UV_BAU_V2:
+ case UV_BAU_V3:
+ bau_desc->payload.uv1_2_3.address = address;
+ bau_desc->payload.uv1_2_3.sending_cpu = cpu;
+ break;
+ case UV_BAU_V4:
+ bau_desc->payload.uv4.address = address;
+ bau_desc->payload.uv4.sending_cpu = cpu;
+ bau_desc->payload.uv4.qualifier = BAU_DESC_QUALIFIER;
+ break;
+ }
+
/*
* uv_flush_send_and_wait returns 0 if all cpu's were messaged,
* or 1 if it gave up and the original cpumask should be returned.
@@ -1296,7 +1319,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
msgdesc.msg_slot = msg - msgdesc.queue_first;
msgdesc.msg = msg;
- if (bcp->uvhub_version == 2)
+ if (bcp->uvhub_version == UV_BAU_V2)
process_uv2_message(&msgdesc, bcp);
else
/* no error workaround for uv1 or uv3 */
@@ -1838,7 +1861,7 @@ static void pq_init(int node, int pnode)
* and the payload queue tail must be maintained by the kernel.
*/
bcp = &per_cpu(bau_control, smp_processor_id());
- if (bcp->uvhub_version <= 3) {
+ if (bcp->uvhub_version <= UV_BAU_V3) {
tail = first;
gnode = uv_gpa_to_gnode(uv_gpa(pqp));
first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail;
@@ -2034,8 +2057,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
struct bau_control **smasterp,
struct bau_control **hmasterp)
{
- int i;
- int cpu;
+ int i, cpu, uvhub_cpu;
struct bau_control *bcp;
for (i = 0; i < sdp->num_cpus; i++) {
@@ -2052,19 +2074,33 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->socket_master = *smasterp;
bcp->uvhub = bdp->uvhub;
if (is_uv1_hub())
- bcp->uvhub_version = 1;
+ bcp->uvhub_version = UV_BAU_V1;
else if (is_uv2_hub())
- bcp->uvhub_version = 2;
+ bcp->uvhub_version = UV_BAU_V2;
else if (is_uv3_hub())
- bcp->uvhub_version = 3;
+ bcp->uvhub_version = UV_BAU_V3;
else if (is_uv4_hub())
- bcp->uvhub_version = 4;
+ bcp->uvhub_version = UV_BAU_V4;
else {
pr_emerg("uvhub version not 1, 2, 3, or 4\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
- bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu);
+ uvhub_cpu = uv_cpu_blade_processor_id(cpu);
+ bcp->uvhub_cpu = uvhub_cpu;
+
+ /*
+ * The ERROR and BUSY status registers are located pairwise over
+ * the STATUS_0 and STATUS_1 mmrs; each an array[32] of 2 bits.
+ */
+ if (uvhub_cpu < UV_CPUS_PER_AS) {
+ bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+ bcp->status_index = uvhub_cpu * UV_ACT_STATUS_SIZE;
+ } else {
+ bcp->status_mmr = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+ bcp->status_index = (uvhub_cpu - UV_CPUS_PER_AS)
+ * UV_ACT_STATUS_SIZE;
+ }
if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
pr_emerg("%d cpus per uvhub invalid\n",
@@ -2147,6 +2183,39 @@ fail:
return 1;
}
+static const struct bau_operations uv1_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+ .wait_completion = uv1_wait_completion,
+};
+
+static const struct bau_operations uv2_3_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+ .wait_completion = uv2_3_wait_completion,
+};
+
+static const struct bau_operations uv4_bau_ops __initconst = {
+ .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
+ .read_l_sw_ack = read_mmr_proc_sw_ack,
+ .read_g_sw_ack = read_gmmr_proc_sw_ack,
+ .write_l_sw_ack = write_mmr_proc_sw_ack,
+ .write_g_sw_ack = write_gmmr_proc_sw_ack,
+ .write_payload_first = write_mmr_proc_payload_first,
+ .write_payload_last = write_mmr_proc_payload_last,
+ .wait_completion = uv4_wait_completion,
+};
+
/*
* Initialization of BAU-related structures
*/
@@ -2166,11 +2235,11 @@ static int __init uv_bau_init(void)
if (is_uv4_hub())
ops = uv4_bau_ops;
else if (is_uv3_hub())
- ops = uv123_bau_ops;
+ ops = uv2_3_bau_ops;
else if (is_uv2_hub())
- ops = uv123_bau_ops;
+ ops = uv2_3_bau_ops;
else if (is_uv1_hub())
- ops = uv123_bau_ops;
+ ops = uv1_bau_ops;
for_each_possible_cpu(cur_cpu) {
mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 66ade16c7693..6b05a9219ea2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt)
* 'pmode_gdt' in wakeup_start.
*/
ctxt->gdt_desc.size = GDT_SIZE - 1;
- ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id());
+ ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id());
store_tr(ctxt->tr);
@@ -162,7 +162,7 @@ static void fix_processor_context(void)
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
- struct desc_struct *desc = get_cpu_gdt_table(cpu);
+ struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
set_tss_desc(cpu, t); /*
@@ -183,6 +183,9 @@ static void fix_processor_context(void)
load_mm_ldt(current->active_mm); /* This does lldt */
fpu__resume_cpu();
+
+ /* The processor is back on the direct GDT, load back the fixmap */
+ load_fixmap_gdt(cpu);
}
/**
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 9f14bd34581d..c35fdb585c68 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -32,6 +32,7 @@ pgd_t *resume_pg_dir;
*/
static pmd_t *resume_one_md_table_init(pgd_t *pgd)
{
+ p4d_t *p4d;
pud_t *pud;
pmd_t *pmd_table;
@@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd)
return NULL;
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
#else
- pud = pud_offset(pgd, 0);
+ p4d = p4d_offset(pgd, 0);
+ pud = pud_offset(p4d, 0);
pmd_table = pmd_offset(pud, 0);
#endif
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index ded2e8272382..2a9f993bbbf0 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
{
pmd_t *pmd;
pud_t *pud;
+ p4d_t *p4d;
/*
* The new mapping only has to cover the page containing the image
@@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
* the virtual address space after switching over to the original page
* tables used by the image kernel.
*/
+
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
+ if (!p4d)
+ return -ENOMEM;
+ }
+
pud = (pud_t *)get_safe_page(GFP_ATOMIC);
if (!pud)
return -ENOMEM;
@@ -75,8 +83,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
set_pud(pud + pud_index(restore_jump_address),
__pud(__pa(pmd) | _KERNPG_TABLE));
- set_pgd(pgd + pgd_index(restore_jump_address),
- __pgd(__pa(pud) | _KERNPG_TABLE));
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
+ set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
+ } else {
+ /* No p4d for 4-level paging: point the pgd to the pud page table */
+ set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE));
+ }
return 0;
}
@@ -124,7 +137,10 @@ static int set_up_temporary_mappings(void)
static int relocate_restore_code(void)
{
pgd_t *pgd;
+ p4d_t *p4d;
pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
relocated_restore_code = get_safe_page(GFP_ATOMIC);
if (!relocated_restore_code)
@@ -134,22 +150,25 @@ static int relocate_restore_code(void)
/* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
- pud = pud_offset(pgd, relocated_restore_code);
+ p4d = p4d_offset(pgd, relocated_restore_code);
+ if (p4d_large(*p4d)) {
+ set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
+ goto out;
+ }
+ pud = pud_offset(p4d, relocated_restore_code);
if (pud_large(*pud)) {
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
- } else {
- pmd_t *pmd = pmd_offset(pud, relocated_restore_code);
-
- if (pmd_large(*pmd)) {
- set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
- } else {
- pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code);
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
- }
+ goto out;
+ }
+ pmd = pmd_offset(pud, relocated_restore_code);
+ if (pmd_large(*pmd)) {
+ set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
+ goto out;
}
+ pte = pte_offset_kernel(pmd, relocated_restore_code);
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
+out:
__flush_tlb_all();
-
return 0;
}
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 0bc60a308730..2a2d89d39af6 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
aspects of the MCE handling code.
WARNING: Do not even assume this interface is staying stable!
+
+config RAS_CEC
+ bool "Correctable Errors Collector"
+ depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+ ---help---
+ This is a small cache which collects correctable memory errors per 4K
+ page PFN and counts their repeated occurrence. Once the counter for a
+ PFN overflows, we try to soft-offline that page as we take it to mean
+ that it has reached a relatively high error count and would probably
+ be best if we don't use it anymore.
+
+ Bear in mind that this is absolutely useless if your platform doesn't
+ have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index e7e7055a8658..46cbbfe03285 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -8,7 +8,7 @@ else
BITS := 64
endif
-obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
+obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
stub_$(BITS).o stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
@@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
ifeq ($(CONFIG_X86_32),y)
-obj-y += checksum_32.o
+obj-y += checksum_32.o syscalls_32.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
index e59eef20647b..b291ca5cf66b 100644
--- a/arch/x86/um/asm/ptrace.h
+++ b/arch/x86/um/asm/ptrace.h
@@ -78,7 +78,7 @@ static inline int ptrace_set_thread_area(struct task_struct *child, int idx,
return -ENOSYS;
}
-extern long arch_prctl(struct task_struct *task, int code,
+extern long arch_prctl(struct task_struct *task, int option,
unsigned long __user *addr);
#endif
diff --git a/arch/x86/um/bug.c b/arch/x86/um/bug.c
deleted file mode 100644
index e8034e363d83..000000000000
--- a/arch/x86/um/bug.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com)
- * Licensed under the GPL V2
- */
-
-#include <linux/uaccess.h>
-
-/*
- * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because
- * that's not relevant in skas mode.
- */
-
-int is_valid_bugaddr(unsigned long eip)
-{
- unsigned short ud2;
-
- if (probe_kernel_address((unsigned short __user *)eip, ud2))
- return 0;
-
- return ud2 == 0x0b0f;
-}
diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c
index 96eb2bd28832..8431e87ac333 100644
--- a/arch/x86/um/os-Linux/prctl.c
+++ b/arch/x86/um/os-Linux/prctl.c
@@ -6,7 +6,7 @@
#include <sys/ptrace.h>
#include <asm/ptrace.h>
-int os_arch_prctl(int pid, int code, unsigned long *addr)
+int os_arch_prctl(int pid, int option, unsigned long *arg2)
{
- return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code);
+ return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) arg2, option);
}
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
new file mode 100644
index 000000000000..627d68836b16
--- /dev/null
+++ b/arch/x86/um/syscalls_32.c
@@ -0,0 +1,7 @@
+#include <linux/syscalls.h>
+#include <os.h>
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+ return -EINVAL;
+}
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 10d907098c26..58f51667e2e4 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -7,13 +7,15 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include <os.h>
-long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
+long arch_prctl(struct task_struct *task, int option,
+ unsigned long __user *arg2)
{
- unsigned long *ptr = addr, tmp;
+ unsigned long *ptr = arg2, tmp;
long ret;
int pid = task->mm->context.id.u.pid;
@@ -30,7 +32,7 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
* arch_prctl is run on the host, then the registers are read
* back.
*/
- switch (code) {
+ switch (option) {
case ARCH_SET_FS:
case ARCH_SET_GS:
ret = restore_registers(pid, &current->thread.regs.regs);
@@ -50,11 +52,11 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
ptr = &tmp;
}
- ret = os_arch_prctl(pid, code, ptr);
+ ret = os_arch_prctl(pid, option, ptr);
if (ret)
return ret;
- switch (code) {
+ switch (option) {
case ARCH_SET_FS:
current->thread.arch.fs = (unsigned long) ptr;
ret = save_registers(pid, &current->thread.regs.regs);
@@ -63,19 +65,19 @@ long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)
ret = save_registers(pid, &current->thread.regs.regs);
break;
case ARCH_GET_FS:
- ret = put_user(tmp, addr);
+ ret = put_user(tmp, arg2);
break;
case ARCH_GET_GS:
- ret = put_user(tmp, addr);
+ ret = put_user(tmp, arg2);
break;
}
return ret;
}
-long sys_arch_prctl(int code, unsigned long addr)
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
- return arch_prctl(current, code, (unsigned long __user *) addr);
+ return arch_prctl(current, option, (unsigned long __user *) arg2);
}
void arch_switch_to(struct task_struct *to)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ec1d5c46e58f..6efa0cc425a2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
*shadow = t->tls_array[i];
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);
@@ -1595,7 +1595,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
- new_cpu_data.wp_works_ok = 1;
new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 37cb5aad71de..4d4b7bc48f5d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd)
return user_ptr;
}
-static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
struct mmu_update u;
u.ptr = virt_to_machine(ptr).maddr;
- u.val = pgd_val_ma(val);
+ u.val = p4d_val_ma(val);
xen_extend_mmu_update(&u);
}
/*
- * Raw hypercall-based set_pgd, intended for in early boot before
+ * Raw hypercall-based set_p4d, intended for in early boot before
* there's a page structure. This implies:
* 1. The only existing pagetable is the kernel's
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
-static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
preempt_disable();
xen_mc_batch();
- __xen_set_pgd_hyper(ptr, val);
+ __xen_set_p4d_hyper(ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-static void xen_set_pgd(pgd_t *ptr, pgd_t val)
+static void xen_set_p4d(p4d_t *ptr, p4d_t val)
{
- pgd_t *user_ptr = xen_get_user_pgd(ptr);
+ pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
+ pgd_t pgd_val;
- trace_xen_mmu_set_pgd(ptr, user_ptr, val);
+ trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
*ptr = val;
if (user_ptr) {
WARN_ON(xen_page_pinned(user_ptr));
- *user_ptr = val;
+ pgd_val.pgd = p4d_val_ma(val);
+ *user_ptr = pgd_val;
}
return;
}
@@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
user updates together. */
xen_mc_batch();
- __xen_set_pgd_hyper(ptr, val);
+ __xen_set_p4d_hyper(ptr, val);
if (user_ptr)
- __xen_set_pgd_hyper(user_ptr, val);
+ __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
+ int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+ bool last, unsigned long limit)
+{
+ int i, nr, flush = 0;
+
+ nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
+ for (i = 0; i < nr; i++) {
+ if (!pmd_none(pmd[i]))
+ flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
+ }
+ return flush;
+}
+
+static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
+ int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+ bool last, unsigned long limit)
+{
+ int i, nr, flush = 0;
+
+ nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
+ for (i = 0; i < nr; i++) {
+ pmd_t *pmd;
+
+ if (pud_none(pud[i]))
+ continue;
+
+ pmd = pmd_offset(&pud[i], 0);
+ if (PTRS_PER_PMD > 1)
+ flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
+ flush |= xen_pmd_walk(mm, pmd, func,
+ last && i == nr - 1, limit);
+ }
+ return flush;
+}
+
+static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
+ int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+ bool last, unsigned long limit)
+{
+ int i, nr, flush = 0;
+
+ nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
+ for (i = 0; i < nr; i++) {
+ pud_t *pud;
+
+ if (p4d_none(p4d[i]))
+ continue;
+
+ pud = pud_offset(&p4d[i], 0);
+ if (PTRS_PER_PUD > 1)
+ flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+ flush |= xen_pud_walk(mm, pud, func,
+ last && i == nr - 1, limit);
+ }
+ return flush;
+}
+
/*
* (Yet another) pagetable walker. This one is intended for pinning a
* pagetable. This means that it walks a pagetable and calls the
@@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
enum pt_level),
unsigned long limit)
{
- int flush = 0;
+ int i, nr, flush = 0;
unsigned hole_low, hole_high;
- unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
- unsigned pgdidx, pudidx, pmdidx;
/* The limit is the last byte to be touched */
limit--;
@@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
- pgdidx_limit = pgd_index(limit);
-#if PTRS_PER_PUD > 1
- pudidx_limit = pud_index(limit);
-#else
- pudidx_limit = 0;
-#endif
-#if PTRS_PER_PMD > 1
- pmdidx_limit = pmd_index(limit);
-#else
- pmdidx_limit = 0;
-#endif
-
- for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
- pud_t *pud;
+ nr = pgd_index(limit) + 1;
+ for (i = 0; i < nr; i++) {
+ p4d_t *p4d;
- if (pgdidx >= hole_low && pgdidx < hole_high)
+ if (i >= hole_low && i < hole_high)
continue;
- if (!pgd_val(pgd[pgdidx]))
+ if (pgd_none(pgd[i]))
continue;
- pud = pud_offset(&pgd[pgdidx], 0);
-
- if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
-
- for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
- pmd_t *pmd;
-
- if (pgdidx == pgdidx_limit &&
- pudidx > pudidx_limit)
- goto out;
-
- if (pud_none(pud[pudidx]))
- continue;
-
- pmd = pmd_offset(&pud[pudidx], 0);
-
- if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
-
- for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
- struct page *pte;
-
- if (pgdidx == pgdidx_limit &&
- pudidx == pudidx_limit &&
- pmdidx > pmdidx_limit)
- goto out;
-
- if (pmd_none(pmd[pmdidx]))
- continue;
-
- pte = pmd_page(pmd[pmdidx]);
- flush |= (*func)(mm, pte, PT_PTE);
- }
- }
+ p4d = p4d_offset(&pgd[i], 0);
+ if (PTRS_PER_P4D > 1)
+ flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
+ flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
-out:
/* Do the top level last, so that the callbacks can use it as
a cue to do final things like tlb flushes. */
flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
@@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
xen_free_ro_pages(pa, PAGE_SIZE);
}
+static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
+{
+ unsigned long pa;
+ pte_t *pte_tbl;
+ int i;
+
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PMD_SIZE);
+ return;
+ }
+
+ pte_tbl = pte_offset_kernel(pmd, 0);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ if (pte_none(pte_tbl[i]))
+ continue;
+ pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
+ xen_free_ro_pages(pa, PAGE_SIZE);
+ }
+ set_pmd(pmd, __pmd(0));
+ xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
+}
+
+static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
+{
+ unsigned long pa;
+ pmd_t *pmd_tbl;
+ int i;
+
+ if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PUD_SIZE);
+ return;
+ }
+
+ pmd_tbl = pmd_offset(pud, 0);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ if (pmd_none(pmd_tbl[i]))
+ continue;
+ xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
+ }
+ set_pud(pud, __pud(0));
+ xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
+}
+
+static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
+{
+ unsigned long pa;
+ pud_t *pud_tbl;
+ int i;
+
+ if (p4d_large(*p4d)) {
+ pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, P4D_SIZE);
+ return;
+ }
+
+ pud_tbl = pud_offset(p4d, 0);
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ if (pud_none(pud_tbl[i]))
+ continue;
+ xen_cleanmfnmap_pud(pud_tbl + i, unpin);
+ }
+ set_p4d(p4d, __p4d(0));
+ xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
+}
+
/*
* Since it is well isolated we can (and since it is perhaps large we should)
* also free the page tables mapping the initial P->M table.
*/
static void __init xen_cleanmfnmap(unsigned long vaddr)
{
- unsigned long va = vaddr & PMD_MASK;
- unsigned long pa;
- pgd_t *pgd = pgd_offset_k(va);
- pud_t *pud_page = pud_offset(pgd, 0);
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pgd_t *pgd;
+ p4d_t *p4d;
unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
- set_pgd(pgd, __pgd(0));
- do {
- pud = pud_page + pud_index(va);
- if (pud_none(*pud)) {
- va += PUD_SIZE;
- } else if (pud_large(*pud)) {
- pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
- xen_free_ro_pages(pa, PUD_SIZE);
- va += PUD_SIZE;
- } else {
- pmd = pmd_offset(pud, va);
- if (pmd_large(*pmd)) {
- pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
- xen_free_ro_pages(pa, PMD_SIZE);
- } else if (!pmd_none(*pmd)) {
- pte = pte_offset_kernel(pmd, va);
- set_pmd(pmd, __pmd(0));
- for (i = 0; i < PTRS_PER_PTE; ++i) {
- if (pte_none(pte[i]))
- break;
- pa = pte_pfn(pte[i]) << PAGE_SHIFT;
- xen_free_ro_pages(pa, PAGE_SIZE);
- }
- xen_cleanmfnmap_free_pgtbl(pte, unpin);
- }
- va += PMD_SIZE;
- if (pmd_index(va))
- continue;
- set_pud(pud, __pud(0));
- xen_cleanmfnmap_free_pgtbl(pmd, unpin);
- }
-
- } while (pud_index(va) || pmd_index(va));
- xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+ vaddr &= PMD_MASK;
+ pgd = pgd_offset_k(vaddr);
+ p4d = p4d_offset(pgd, 0);
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ if (p4d_none(p4d[i]))
+ continue;
+ xen_cleanmfnmap_p4d(p4d + i, unpin);
+ }
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ set_pgd(pgd, __pgd(0));
+ xen_cleanmfnmap_free_pgtbl(p4d, unpin);
+ }
}
static void __init xen_pagetable_p2m_free(void)
@@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
}
#endif
-
return ret;
}
@@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2071,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
*/
void __init xen_relocate_p2m(void)
{
- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
- int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+ int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
+ p4d_t *p4d = NULL;
pgd_t *pgd;
unsigned long *new_p2m;
+ int save_pud;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
- n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
- n_frames = n_pte + n_pt + n_pmd + n_pud;
+ n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
+ if (PTRS_PER_P4D > 1)
+ n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+ else
+ n_p4d = 0;
+ n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
@@ -2101,55 +2161,76 @@ void __init xen_relocate_p2m(void)
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
- pud_phys = new_area;
+ p4d_phys = new_area;
+ pud_phys = p4d_phys + PFN_PHYS(n_p4d);
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
- pud = early_memremap(pud_phys, PAGE_SIZE);
- clear_page(pud);
- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
- idx_pmd++) {
- pmd = early_memremap(pmd_phys, PAGE_SIZE);
- clear_page(pmd);
- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
- idx_pt++) {
- pt = early_memremap(pt_phys, PAGE_SIZE);
- clear_page(pt);
- for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
- p2m_pfn++;
+ idx_p4d = 0;
+ save_pud = n_pud;
+ do {
+ if (n_p4d > 0) {
+ p4d = early_memremap(p4d_phys, PAGE_SIZE);
+ clear_page(p4d);
+ n_pud = min(save_pud, PTRS_PER_P4D);
+ }
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
+ }
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
}
- n_pte -= PTRS_PER_PTE;
- early_memunmap(pt, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pt_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
- PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
- pt_phys += PAGE_SIZE;
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
}
- n_pt -= PTRS_PER_PMD;
- early_memunmap(pmd, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pmd_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
- PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
- pmd_phys += PAGE_SIZE;
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ if (n_p4d > 0)
+ set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
+ else
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
}
- n_pmd -= PTRS_PER_PUD;
- early_memunmap(pud, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pud_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
- pud_phys += PAGE_SIZE;
- }
+ if (n_p4d > 0) {
+ save_pud -= PTRS_PER_P4D;
+ early_memunmap(p4d, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(p4d_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
+ set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
+ p4d_phys += PAGE_SIZE;
+ }
+ } while (++idx_p4d < n_p4d);
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
@@ -2326,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
+ case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
@@ -2378,8 +2460,8 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if CONFIG_PGTABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = xen_set_pgd;
+#if CONFIG_PGTABLE_LEVELS >= 4
+ pv_mmu_ops.set_p4d = xen_set_p4d;
#endif
/* This will work as long as patching hasn't happened yet
@@ -2388,7 +2470,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2454,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
- .set_pgd = xen_set_pgd_hyper,
+ .set_p4d = xen_set_p4d_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 73809bb951b4..3fe2b3292915 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -5,6 +5,7 @@
enum pt_level {
PT_PGD,
+ PT_P4D,
PT_PUD,
PT_PMD,
PT_PTE
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7ff2f1bfb7ec..eaa36162ed4a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 83e5f7e1a20d..dad02c0f21b9 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -440,7 +440,7 @@ config ACPI_CUSTOM_METHOD
config ACPI_BGRT
bool "Boottime Graphics Resource Table support"
- depends on EFI && X86
+ depends on EFI && (X86 || ARM64)
help
This driver adds support for exposing the ACPI Boottime Graphics
Resource Table, which allows the operating system to obtain
diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c
index ca28aa572aa9..df1c629205e7 100644
--- a/drivers/acpi/bgrt.c
+++ b/drivers/acpi/bgrt.c
@@ -81,6 +81,12 @@ static struct attribute_group bgrt_attribute_group = {
.bin_attrs = bgrt_bin_attributes,
};
+int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+ efi_bgrt_init(table);
+ return 0;
+}
+
static int __init bgrt_init(void)
{
int ret;
diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c
index aff87df07449..bc48cbf6a795 100644
--- a/drivers/clocksource/em_sti.c
+++ b/drivers/clocksource/em_sti.c
@@ -78,15 +78,12 @@ static int em_sti_enable(struct em_sti_priv *p)
int ret;
/* enable clock */
- ret = clk_prepare_enable(p->clk);
+ ret = clk_enable(p->clk);
if (ret) {
dev_err(&p->pdev->dev, "cannot enable clock\n");
return ret;
}
- /* configure channel, periodic mode and maximum timeout */
- p->rate = clk_get_rate(p->clk);
-
/* reset the counter */
em_sti_write(p, STI_SET_H, 0x40000000);
em_sti_write(p, STI_SET_L, 0x00000000);
@@ -107,7 +104,7 @@ static void em_sti_disable(struct em_sti_priv *p)
em_sti_write(p, STI_INTENCLR, 3);
/* stop clock */
- clk_disable_unprepare(p->clk);
+ clk_disable(p->clk);
}
static u64 em_sti_count(struct em_sti_priv *p)
@@ -205,13 +202,9 @@ static u64 em_sti_clocksource_read(struct clocksource *cs)
static int em_sti_clocksource_enable(struct clocksource *cs)
{
- int ret;
struct em_sti_priv *p = cs_to_em_sti(cs);
- ret = em_sti_start(p, USER_CLOCKSOURCE);
- if (!ret)
- __clocksource_update_freq_hz(cs, p->rate);
- return ret;
+ return em_sti_start(p, USER_CLOCKSOURCE);
}
static void em_sti_clocksource_disable(struct clocksource *cs)
@@ -240,8 +233,7 @@ static int em_sti_register_clocksource(struct em_sti_priv *p)
dev_info(&p->pdev->dev, "used as clock source\n");
- /* Register with dummy 1 Hz value, gets updated in ->enable() */
- clocksource_register_hz(cs, 1);
+ clocksource_register_hz(cs, p->rate);
return 0;
}
@@ -263,7 +255,6 @@ static int em_sti_clock_event_set_oneshot(struct clock_event_device *ced)
dev_info(&p->pdev->dev, "used for oneshot clock events\n");
em_sti_start(p, USER_CLOCKEVENT);
- clockevents_config(&p->ced, p->rate);
return 0;
}
@@ -294,8 +285,7 @@ static void em_sti_register_clockevent(struct em_sti_priv *p)
dev_info(&p->pdev->dev, "used for clock events\n");
- /* Register with dummy 1 Hz value, gets updated in ->set_state_oneshot() */
- clockevents_config_and_register(ced, 1, 2, 0xffffffff);
+ clockevents_config_and_register(ced, p->rate, 2, 0xffffffff);
}
static int em_sti_probe(struct platform_device *pdev)
@@ -303,6 +293,7 @@ static int em_sti_probe(struct platform_device *pdev)
struct em_sti_priv *p;
struct resource *res;
int irq;
+ int ret;
p = devm_kzalloc(&pdev->dev, sizeof(*p), GFP_KERNEL);
if (p == NULL)
@@ -323,6 +314,13 @@ static int em_sti_probe(struct platform_device *pdev)
if (IS_ERR(p->base))
return PTR_ERR(p->base);
+ if (devm_request_irq(&pdev->dev, irq, em_sti_interrupt,
+ IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
+ dev_name(&pdev->dev), p)) {
+ dev_err(&pdev->dev, "failed to request low IRQ\n");
+ return -ENOENT;
+ }
+
/* get hold of clock */
p->clk = devm_clk_get(&pdev->dev, "sclk");
if (IS_ERR(p->clk)) {
@@ -330,12 +328,20 @@ static int em_sti_probe(struct platform_device *pdev)
return PTR_ERR(p->clk);
}
- if (devm_request_irq(&pdev->dev, irq, em_sti_interrupt,
- IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
- dev_name(&pdev->dev), p)) {
- dev_err(&pdev->dev, "failed to request low IRQ\n");
- return -ENOENT;
+ ret = clk_prepare(p->clk);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "cannot prepare clock\n");
+ return ret;
+ }
+
+ ret = clk_enable(p->clk);
+ if (ret < 0) {
+ dev_err(&p->pdev->dev, "cannot enable clock\n");
+ clk_unprepare(p->clk);
+ return ret;
}
+ p->rate = clk_get_rate(p->clk);
+ clk_disable(p->clk);
raw_spin_lock_init(&p->lock);
em_sti_register_clockevent(p);
diff --git a/drivers/clocksource/h8300_timer8.c b/drivers/clocksource/h8300_timer8.c
index 546bb180f5a4..804c489531d6 100644
--- a/drivers/clocksource/h8300_timer8.c
+++ b/drivers/clocksource/h8300_timer8.c
@@ -101,15 +101,7 @@ static inline struct timer8_priv *ced_to_priv(struct clock_event_device *ced)
static void timer8_clock_event_start(struct timer8_priv *p, unsigned long delta)
{
- struct clock_event_device *ced = &p->ced;
-
timer8_start(p);
-
- ced->shift = 32;
- ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift);
- ced->max_delta_ns = clockevent_delta2ns(0xffff, ced);
- ced->min_delta_ns = clockevent_delta2ns(0x0001, ced);
-
timer8_set_next(p, delta);
}
diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 28757edf6aca..e3bf3baa12cc 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -103,7 +103,6 @@ struct sh_cmt_channel {
unsigned long match_value;
unsigned long next_match_value;
unsigned long max_match_value;
- unsigned long rate;
raw_spinlock_t lock;
struct clock_event_device ced;
struct clocksource cs;
@@ -118,6 +117,7 @@ struct sh_cmt_device {
void __iomem *mapbase;
struct clk *clk;
+ unsigned long rate;
raw_spinlock_t lock; /* Protect the shared start/stop register */
@@ -320,7 +320,7 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_channel *ch, int start)
raw_spin_unlock_irqrestore(&ch->cmt->lock, flags);
}
-static int sh_cmt_enable(struct sh_cmt_channel *ch, unsigned long *rate)
+static int sh_cmt_enable(struct sh_cmt_channel *ch)
{
int k, ret;
@@ -340,11 +340,9 @@ static int sh_cmt_enable(struct sh_cmt_channel *ch, unsigned long *rate)
/* configure channel, periodic mode and maximum timeout */
if (ch->cmt->info->width == 16) {
- *rate = clk_get_rate(ch->cmt->clk) / 512;
sh_cmt_write_cmcsr(ch, SH_CMT16_CMCSR_CMIE |
SH_CMT16_CMCSR_CKS512);
} else {
- *rate = clk_get_rate(ch->cmt->clk) / 8;
sh_cmt_write_cmcsr(ch, SH_CMT32_CMCSR_CMM |
SH_CMT32_CMCSR_CMTOUT_IE |
SH_CMT32_CMCSR_CMR_IRQ |
@@ -572,7 +570,7 @@ static int sh_cmt_start(struct sh_cmt_channel *ch, unsigned long flag)
raw_spin_lock_irqsave(&ch->lock, flags);
if (!(ch->flags & (FLAG_CLOCKEVENT | FLAG_CLOCKSOURCE)))
- ret = sh_cmt_enable(ch, &ch->rate);
+ ret = sh_cmt_enable(ch);
if (ret)
goto out;
@@ -640,10 +638,9 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs)
ch->total_cycles = 0;
ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE);
- if (!ret) {
- __clocksource_update_freq_hz(cs, ch->rate);
+ if (!ret)
ch->cs_enabled = true;
- }
+
return ret;
}
@@ -697,8 +694,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_channel *ch,
dev_info(&ch->cmt->pdev->dev, "ch%u: used as clock source\n",
ch->index);
- /* Register with dummy 1 Hz value, gets updated in ->enable() */
- clocksource_register_hz(cs, 1);
+ clocksource_register_hz(cs, ch->cmt->rate);
return 0;
}
@@ -709,19 +705,10 @@ static struct sh_cmt_channel *ced_to_sh_cmt(struct clock_event_device *ced)
static void sh_cmt_clock_event_start(struct sh_cmt_channel *ch, int periodic)
{
- struct clock_event_device *ced = &ch->ced;
-
sh_cmt_start(ch, FLAG_CLOCKEVENT);
- /* TODO: calculate good shift from rate and counter bit width */
-
- ced->shift = 32;
- ced->mult = div_sc(ch->rate, NSEC_PER_SEC, ced->shift);
- ced->max_delta_ns = clockevent_delta2ns(ch->max_match_value, ced);
- ced->min_delta_ns = clockevent_delta2ns(0x1f, ced);
-
if (periodic)
- sh_cmt_set_next(ch, ((ch->rate + HZ/2) / HZ) - 1);
+ sh_cmt_set_next(ch, ((ch->cmt->rate + HZ/2) / HZ) - 1);
else
sh_cmt_set_next(ch, ch->max_match_value);
}
@@ -824,6 +811,12 @@ static int sh_cmt_register_clockevent(struct sh_cmt_channel *ch,
ced->suspend = sh_cmt_clock_event_suspend;
ced->resume = sh_cmt_clock_event_resume;
+ /* TODO: calculate good shift from rate and counter bit width */
+ ced->shift = 32;
+ ced->mult = div_sc(ch->cmt->rate, NSEC_PER_SEC, ced->shift);
+ ced->max_delta_ns = clockevent_delta2ns(ch->max_match_value, ced);
+ ced->min_delta_ns = clockevent_delta2ns(0x1f, ced);
+
dev_info(&ch->cmt->pdev->dev, "ch%u: used for clock events\n",
ch->index);
clockevents_register_device(ced);
@@ -996,6 +989,18 @@ static int sh_cmt_setup(struct sh_cmt_device *cmt, struct platform_device *pdev)
if (ret < 0)
goto err_clk_put;
+ /* Determine clock rate. */
+ ret = clk_enable(cmt->clk);
+ if (ret < 0)
+ goto err_clk_unprepare;
+
+ if (cmt->info->width == 16)
+ cmt->rate = clk_get_rate(cmt->clk) / 512;
+ else
+ cmt->rate = clk_get_rate(cmt->clk) / 8;
+
+ clk_disable(cmt->clk);
+
/* Map the memory resource(s). */
ret = sh_cmt_map_memory(cmt);
if (ret < 0)
diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
index 1fbf2aadcfd4..31d881621e41 100644
--- a/drivers/clocksource/sh_tmu.c
+++ b/drivers/clocksource/sh_tmu.c
@@ -46,7 +46,6 @@ struct sh_tmu_channel {
void __iomem *base;
int irq;
- unsigned long rate;
unsigned long periodic;
struct clock_event_device ced;
struct clocksource cs;
@@ -59,6 +58,7 @@ struct sh_tmu_device {
void __iomem *mapbase;
struct clk *clk;
+ unsigned long rate;
enum sh_tmu_model model;
@@ -165,7 +165,6 @@ static int __sh_tmu_enable(struct sh_tmu_channel *ch)
sh_tmu_write(ch, TCNT, 0xffffffff);
/* configure channel to parent clock / 4, irq off */
- ch->rate = clk_get_rate(ch->tmu->clk) / 4;
sh_tmu_write(ch, TCR, TCR_TPSC_CLK4);
/* enable channel */
@@ -271,10 +270,8 @@ static int sh_tmu_clocksource_enable(struct clocksource *cs)
return 0;
ret = sh_tmu_enable(ch);
- if (!ret) {
- __clocksource_update_freq_hz(cs, ch->rate);
+ if (!ret)
ch->cs_enabled = true;
- }
return ret;
}
@@ -334,8 +331,7 @@ static int sh_tmu_register_clocksource(struct sh_tmu_channel *ch,
dev_info(&ch->tmu->pdev->dev, "ch%u: used as clock source\n",
ch->index);
- /* Register with dummy 1 Hz value, gets updated in ->enable() */
- clocksource_register_hz(cs, 1);
+ clocksource_register_hz(cs, ch->tmu->rate);
return 0;
}
@@ -346,14 +342,10 @@ static struct sh_tmu_channel *ced_to_sh_tmu(struct clock_event_device *ced)
static void sh_tmu_clock_event_start(struct sh_tmu_channel *ch, int periodic)
{
- struct clock_event_device *ced = &ch->ced;
-
sh_tmu_enable(ch);
- clockevents_config(ced, ch->rate);
-
if (periodic) {
- ch->periodic = (ch->rate + HZ/2) / HZ;
+ ch->periodic = (ch->tmu->rate + HZ/2) / HZ;
sh_tmu_set_next(ch, ch->periodic, 1);
}
}
@@ -435,7 +427,7 @@ static void sh_tmu_register_clockevent(struct sh_tmu_channel *ch,
dev_info(&ch->tmu->pdev->dev, "ch%u: used for clock events\n",
ch->index);
- clockevents_config_and_register(ced, 1, 0x300, 0xffffffff);
+ clockevents_config_and_register(ced, ch->tmu->rate, 0x300, 0xffffffff);
ret = request_irq(ch->irq, sh_tmu_interrupt,
IRQF_TIMER | IRQF_IRQPOLL | IRQF_NOBALANCING,
@@ -561,6 +553,14 @@ static int sh_tmu_setup(struct sh_tmu_device *tmu, struct platform_device *pdev)
if (ret < 0)
goto err_clk_put;
+ /* Determine clock rate. */
+ ret = clk_enable(tmu->clk);
+ if (ret < 0)
+ goto err_clk_unprepare;
+
+ tmu->rate = clk_get_rate(tmu->clk) / 4;
+ clk_disable(tmu->clk);
+
/* Map the memory resource. */
ret = sh_tmu_map_memory(tmu);
if (ret < 0) {
diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile
index ad67342313ed..0329d319d89a 100644
--- a/drivers/firmware/efi/Makefile
+++ b/drivers/firmware/efi/Makefile
@@ -9,6 +9,7 @@
#
KASAN_SANITIZE_runtime-wrappers.o := n
+obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o
obj-$(CONFIG_EFI) += capsule.o memmap.o
obj-$(CONFIG_EFI_VARS) += efivars.o
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/drivers/firmware/efi/efi-bgrt.c
index 04ca8764f0c0..04ca8764f0c0 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/drivers/firmware/efi/efi-bgrt.c
diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c
index 93d8cdbe7ef4..6371e83933f3 100644
--- a/drivers/firmware/efi/efi-pstore.c
+++ b/drivers/firmware/efi/efi-pstore.c
@@ -254,9 +254,9 @@ static int efi_pstore_write(struct pstore_record *record)
for (i = 0; i < DUMP_NAME_LEN; i++)
efi_name[i] = name[i];
- efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
- !pstore_cannot_block_path(record->reason),
- record->size, record->psi->buf);
+ ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES,
+ !pstore_cannot_block_path(record->reason),
+ record->size, record->psi->buf);
if (record->reason == KMSG_DUMP_OOPS)
efivar_run_worker();
diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index d4056c6be1ec..1e45ec51b094 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -18,7 +18,21 @@
#include "efistub.h"
-bool __nokaslr;
+/*
+ * This is the base address at which to start allocating virtual memory ranges
+ * for UEFI Runtime Services. This is in the low TTBR0 range so that we can use
+ * any allocation we choose, and eliminate the risk of a conflict after kexec.
+ * The value chosen is the largest non-zero power of 2 suitable for this purpose
+ * both on 32-bit and 64-bit ARM CPUs, to maximize the likelihood that it can
+ * be mapped efficiently.
+ * Since 32-bit ARM could potentially execute with a 1G/3G user/kernel split,
+ * map everything below 1 GB. (512 MB is a reasonable upper bound for the
+ * entire footprint of the UEFI runtime services memory regions)
+ */
+#define EFI_RT_VIRTUAL_BASE SZ_512M
+#define EFI_RT_VIRTUAL_SIZE SZ_512M
+
+static u64 virtmap_base = EFI_RT_VIRTUAL_BASE;
efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
void *__image, void **__fh)
@@ -118,8 +132,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
goto fail;
- pr_efi(sys_table, "Booting Linux Kernel...\n");
-
status = check_platform_features(sys_table);
if (status != EFI_SUCCESS)
goto fail;
@@ -153,17 +165,15 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
goto fail;
}
- /* check whether 'nokaslr' was passed on the command line */
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- static const u8 default_cmdline[] = CONFIG_CMDLINE;
- const u8 *str, *cmdline = cmdline_ptr;
+ if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
+ IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
+ cmdline_size == 0)
+ efi_parse_options(CONFIG_CMDLINE);
- if (IS_ENABLED(CONFIG_CMDLINE_FORCE))
- cmdline = default_cmdline;
- str = strstr(cmdline, "nokaslr");
- if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
- __nokaslr = true;
- }
+ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0)
+ efi_parse_options(cmdline_ptr);
+
+ pr_efi(sys_table, "Booting Linux Kernel...\n");
si = setup_graphics(sys_table);
@@ -176,10 +186,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
goto fail_free_cmdline;
}
- status = efi_parse_options(cmdline_ptr);
- if (status != EFI_SUCCESS)
- pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
-
secure_boot = efi_get_secureboot(sys_table);
/*
@@ -213,8 +219,9 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
if (!fdt_addr)
pr_efi(sys_table, "Generating empty DTB\n");
- status = handle_cmdline_files(sys_table, image, cmdline_ptr,
- "initrd=", dram_base + SZ_512M,
+ status = handle_cmdline_files(sys_table, image, cmdline_ptr, "initrd=",
+ efi_get_max_initrd_addr(dram_base,
+ *image_addr),
(unsigned long *)&initrd_addr,
(unsigned long *)&initrd_size);
if (status != EFI_SUCCESS)
@@ -222,9 +229,28 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
efi_random_get_seed(sys_table);
+ if (!nokaslr()) {
+ /*
+ * Randomize the base of the UEFI runtime services region.
+ * Preserve the 2 MB alignment of the region by taking a
+ * shift of 21 bit positions into account when scaling
+ * the headroom value using a 32-bit random value.
+ */
+ u64 headroom = TASK_SIZE - EFI_RT_VIRTUAL_BASE -
+ EFI_RT_VIRTUAL_SIZE;
+ u32 rnd;
+
+ status = efi_get_random_bytes(sys_table, sizeof(rnd),
+ (u8 *)&rnd);
+ if (status == EFI_SUCCESS) {
+ virtmap_base = EFI_RT_VIRTUAL_BASE +
+ (((headroom >> 21) * rnd) >> (32 - 21));
+ }
+ }
+
new_fdt_addr = fdt_addr;
status = allocate_new_fdt_and_exit_boot(sys_table, handle,
- &new_fdt_addr, dram_base + MAX_FDT_OFFSET,
+ &new_fdt_addr, efi_get_max_fdt_addr(dram_base),
initrd_addr, initrd_size, cmdline_ptr,
fdt_addr, fdt_size);
@@ -251,18 +277,6 @@ fail:
return EFI_ERROR;
}
-/*
- * This is the base address at which to start allocating virtual memory ranges
- * for UEFI Runtime Services. This is in the low TTBR0 range so that we can use
- * any allocation we choose, and eliminate the risk of a conflict after kexec.
- * The value chosen is the largest non-zero power of 2 suitable for this purpose
- * both on 32-bit and 64-bit ARM CPUs, to maximize the likelihood that it can
- * be mapped efficiently.
- * Since 32-bit ARM could potentially execute with a 1G/3G user/kernel split,
- * map everything below 1 GB.
- */
-#define EFI_RT_VIRTUAL_BASE SZ_512M
-
static int cmp_mem_desc(const void *l, const void *r)
{
const efi_memory_desc_t *left = l, *right = r;
@@ -312,7 +326,7 @@ void efi_get_virtmap(efi_memory_desc_t *memory_map, unsigned long map_size,
unsigned long desc_size, efi_memory_desc_t *runtime_map,
int *count)
{
- u64 efi_virt_base = EFI_RT_VIRTUAL_BASE;
+ u64 efi_virt_base = virtmap_base;
efi_memory_desc_t *in, *prev = NULL, *out = runtime_map;
int l;
diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
index e1f0b28e1dcb..becbda445913 100644
--- a/drivers/firmware/efi/libstub/arm32-stub.c
+++ b/drivers/firmware/efi/libstub/arm32-stub.c
@@ -9,6 +9,8 @@
#include <linux/efi.h>
#include <asm/efi.h>
+#include "efistub.h"
+
efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
{
int block;
@@ -63,6 +65,132 @@ void free_screen_info(efi_system_table_t *sys_table_arg, struct screen_info *si)
efi_call_early(free_pool, si);
}
+static efi_status_t reserve_kernel_base(efi_system_table_t *sys_table_arg,
+ unsigned long dram_base,
+ unsigned long *reserve_addr,
+ unsigned long *reserve_size)
+{
+ efi_physical_addr_t alloc_addr;
+ efi_memory_desc_t *memory_map;
+ unsigned long nr_pages, map_size, desc_size, buff_size;
+ efi_status_t status;
+ unsigned long l;
+
+ struct efi_boot_memmap map = {
+ .map = &memory_map,
+ .map_size = &map_size,
+ .desc_size = &desc_size,
+ .desc_ver = NULL,
+ .key_ptr = NULL,
+ .buff_size = &buff_size,
+ };
+
+ /*
+ * Reserve memory for the uncompressed kernel image. This is
+ * all that prevents any future allocations from conflicting
+ * with the kernel. Since we can't tell from the compressed
+ * image how much DRAM the kernel actually uses (due to BSS
+ * size uncertainty) we allocate the maximum possible size.
+ * Do this very early, as prints can cause memory allocations
+ * that may conflict with this.
+ */
+ alloc_addr = dram_base + MAX_UNCOMP_KERNEL_SIZE;
+ nr_pages = MAX_UNCOMP_KERNEL_SIZE / EFI_PAGE_SIZE;
+ status = efi_call_early(allocate_pages, EFI_ALLOCATE_MAX_ADDRESS,
+ EFI_BOOT_SERVICES_DATA, nr_pages, &alloc_addr);
+ if (status == EFI_SUCCESS) {
+ if (alloc_addr == dram_base) {
+ *reserve_addr = alloc_addr;
+ *reserve_size = MAX_UNCOMP_KERNEL_SIZE;
+ return EFI_SUCCESS;
+ }
+ /*
+ * If we end up here, the allocation succeeded but starts below
+ * dram_base. This can only occur if the real base of DRAM is
+ * not a multiple of 128 MB, in which case dram_base will have
+ * been rounded up. Since this implies that a part of the region
+ * was already occupied, we need to fall through to the code
+ * below to ensure that the existing allocations don't conflict.
+ * For this reason, we use EFI_BOOT_SERVICES_DATA above and not
+ * EFI_LOADER_DATA, which we wouldn't able to distinguish from
+ * allocations that we want to disallow.
+ */
+ }
+
+ /*
+ * If the allocation above failed, we may still be able to proceed:
+ * if the only allocations in the region are of types that will be
+ * released to the OS after ExitBootServices(), the decompressor can
+ * safely overwrite them.
+ */
+ status = efi_get_memory_map(sys_table_arg, &map);
+ if (status != EFI_SUCCESS) {
+ pr_efi_err(sys_table_arg,
+ "reserve_kernel_base(): Unable to retrieve memory map.\n");
+ return status;
+ }
+
+ for (l = 0; l < map_size; l += desc_size) {
+ efi_memory_desc_t *desc;
+ u64 start, end;
+
+ desc = (void *)memory_map + l;
+ start = desc->phys_addr;
+ end = start + desc->num_pages * EFI_PAGE_SIZE;
+
+ /* Skip if entry does not intersect with region */
+ if (start >= dram_base + MAX_UNCOMP_KERNEL_SIZE ||
+ end <= dram_base)
+ continue;
+
+ switch (desc->type) {
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ /* Ignore types that are released to the OS anyway */
+ continue;
+
+ case EFI_CONVENTIONAL_MEMORY:
+ /*
+ * Reserve the intersection between this entry and the
+ * region.
+ */
+ start = max(start, (u64)dram_base);
+ end = min(end, (u64)dram_base + MAX_UNCOMP_KERNEL_SIZE);
+
+ status = efi_call_early(allocate_pages,
+ EFI_ALLOCATE_ADDRESS,
+ EFI_LOADER_DATA,
+ (end - start) / EFI_PAGE_SIZE,
+ &start);
+ if (status != EFI_SUCCESS) {
+ pr_efi_err(sys_table_arg,
+ "reserve_kernel_base(): alloc failed.\n");
+ goto out;
+ }
+ break;
+
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ /*
+ * These regions may be released and reallocated for
+ * another purpose (including EFI_RUNTIME_SERVICE_DATA)
+ * at any time during the execution of the OS loader,
+ * so we cannot consider them as safe.
+ */
+ default:
+ /*
+ * Treat any other allocation in the region as unsafe */
+ status = EFI_OUT_OF_RESOURCES;
+ goto out;
+ }
+ }
+
+ status = EFI_SUCCESS;
+out:
+ efi_call_early(free_pool, memory_map);
+ return status;
+}
+
efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
unsigned long *image_addr,
unsigned long *image_size,
@@ -71,10 +199,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
unsigned long dram_base,
efi_loaded_image_t *image)
{
- unsigned long nr_pages;
efi_status_t status;
- /* Use alloc_addr to tranlsate between types */
- efi_physical_addr_t alloc_addr;
/*
* Verify that the DRAM base address is compatible with the ARM
@@ -85,27 +210,12 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table,
*/
dram_base = round_up(dram_base, SZ_128M);
- /*
- * Reserve memory for the uncompressed kernel image. This is
- * all that prevents any future allocations from conflicting
- * with the kernel. Since we can't tell from the compressed
- * image how much DRAM the kernel actually uses (due to BSS
- * size uncertainty) we allocate the maximum possible size.
- * Do this very early, as prints can cause memory allocations
- * that may conflict with this.
- */
- alloc_addr = dram_base;
- *reserve_size = MAX_UNCOMP_KERNEL_SIZE;
- nr_pages = round_up(*reserve_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
- status = sys_table->boottime->allocate_pages(EFI_ALLOCATE_ADDRESS,
- EFI_LOADER_DATA,
- nr_pages, &alloc_addr);
+ status = reserve_kernel_base(sys_table, dram_base, reserve_addr,
+ reserve_size);
if (status != EFI_SUCCESS) {
- *reserve_size = 0;
pr_efi_err(sys_table, "Unable to allocate memory for uncompressed kernel.\n");
return status;
}
- *reserve_addr = alloc_addr;
/*
* Relocate the zImage, so that it appears in the lowest 128 MB
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index eae693eb3e91..b4c2589d7c91 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -16,8 +16,6 @@
#include "efistub.h"
-extern bool __nokaslr;
-
efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
{
u64 tg;
@@ -52,7 +50,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table_arg,
u64 phys_seed = 0;
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- if (!__nokaslr) {
+ if (!nokaslr()) {
status = efi_get_random_bytes(sys_table_arg,
sizeof(phys_seed),
(u8 *)&phys_seed);
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 919822b7773d..b0184360efc6 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -32,6 +32,18 @@
static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
+static int __section(.data) __nokaslr;
+static int __section(.data) __quiet;
+
+int __pure nokaslr(void)
+{
+ return __nokaslr;
+}
+int __pure is_quiet(void)
+{
+ return __quiet;
+}
+
#define EFI_MMAP_NR_SLACK_SLOTS 8
struct file_info {
@@ -409,17 +421,17 @@ static efi_status_t efi_file_close(void *handle)
* environments, first in the early boot environment of the EFI boot
* stub, and subsequently during the kernel boot.
*/
-efi_status_t efi_parse_options(char *cmdline)
+efi_status_t efi_parse_options(char const *cmdline)
{
char *str;
- /*
- * Currently, the only efi= option we look for is 'nochunk', which
- * is intended to work around known issues on certain x86 UEFI
- * versions. So ignore for now on other architectures.
- */
- if (!IS_ENABLED(CONFIG_X86))
- return EFI_SUCCESS;
+ str = strstr(cmdline, "nokaslr");
+ if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+ __nokaslr = 1;
+
+ str = strstr(cmdline, "quiet");
+ if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+ __quiet = 1;
/*
* If no EFI parameters were specified on the cmdline we've got
@@ -436,14 +448,14 @@ efi_status_t efi_parse_options(char *cmdline)
* Remember, because efi= is also used by the kernel we need to
* skip over arguments we don't understand.
*/
- while (*str) {
+ while (*str && *str != ' ') {
if (!strncmp(str, "nochunk", 7)) {
str += strlen("nochunk");
__chunk_size = -1UL;
}
/* Group words together, delimited by "," */
- while (*str && *str != ',')
+ while (*str && *str != ' ' && *str != ',')
str++;
if (*str == ',')
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index 71c4d0e3c4ed..83f268c05007 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -24,6 +24,15 @@
#define EFI_ALLOC_ALIGN EFI_PAGE_SIZE
#endif
+extern int __pure nokaslr(void);
+extern int __pure is_quiet(void);
+
+#define pr_efi(sys_table, msg) do { \
+ if (!is_quiet()) efi_printk(sys_table, "EFI stub: "msg); \
+} while (0)
+
+#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg)
+
void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,
diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c
index 82973b86efe4..8830fa601e45 100644
--- a/drivers/firmware/efi/libstub/fdt.c
+++ b/drivers/firmware/efi/libstub/fdt.c
@@ -230,6 +230,10 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
return update_fdt_memmap(p->new_fdt_addr, map);
}
+#ifndef MAX_FDT_SIZE
+#define MAX_FDT_SIZE SZ_2M
+#endif
+
/*
* Allocate memory for a new FDT, then add EFI, commandline, and
* initrd related fields to the FDT. This routine increases the
@@ -257,7 +261,6 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
u32 desc_ver;
unsigned long mmap_key;
efi_memory_desc_t *memory_map, *runtime_map;
- unsigned long new_fdt_size;
efi_status_t status;
int runtime_entry_count = 0;
struct efi_boot_memmap map;
@@ -286,41 +289,29 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
"Exiting boot services and installing virtual address map...\n");
map.map = &memory_map;
+ status = efi_high_alloc(sys_table, MAX_FDT_SIZE, EFI_FDT_ALIGN,
+ new_fdt_addr, max_addr);
+ if (status != EFI_SUCCESS) {
+ pr_efi_err(sys_table,
+ "Unable to allocate memory for new device tree.\n");
+ goto fail;
+ }
+
/*
- * Estimate size of new FDT, and allocate memory for it. We
- * will allocate a bigger buffer if this ends up being too
- * small, so a rough guess is OK here.
+ * Now that we have done our final memory allocation (and free)
+ * we can get the memory map key needed for exit_boot_services().
*/
- new_fdt_size = fdt_size + EFI_PAGE_SIZE;
- while (1) {
- status = efi_high_alloc(sys_table, new_fdt_size, EFI_FDT_ALIGN,
- new_fdt_addr, max_addr);
- if (status != EFI_SUCCESS) {
- pr_efi_err(sys_table, "Unable to allocate memory for new device tree.\n");
- goto fail;
- }
-
- status = update_fdt(sys_table,
- (void *)fdt_addr, fdt_size,
- (void *)*new_fdt_addr, new_fdt_size,
- cmdline_ptr, initrd_addr, initrd_size);
+ status = efi_get_memory_map(sys_table, &map);
+ if (status != EFI_SUCCESS)
+ goto fail_free_new_fdt;
- /* Succeeding the first time is the expected case. */
- if (status == EFI_SUCCESS)
- break;
+ status = update_fdt(sys_table, (void *)fdt_addr, fdt_size,
+ (void *)*new_fdt_addr, MAX_FDT_SIZE, cmdline_ptr,
+ initrd_addr, initrd_size);
- if (status == EFI_BUFFER_TOO_SMALL) {
- /*
- * We need to allocate more space for the new
- * device tree, so free existing buffer that is
- * too small.
- */
- efi_free(sys_table, new_fdt_size, *new_fdt_addr);
- new_fdt_size += EFI_PAGE_SIZE;
- } else {
- pr_efi_err(sys_table, "Unable to construct new device tree.\n");
- goto fail_free_new_fdt;
- }
+ if (status != EFI_SUCCESS) {
+ pr_efi_err(sys_table, "Unable to construct new device tree.\n");
+ goto fail_free_new_fdt;
}
priv.runtime_map = runtime_map;
@@ -364,7 +355,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(efi_system_table_t *sys_table,
pr_efi_err(sys_table, "Exit boot services failed.\n");
fail_free_new_fdt:
- efi_free(sys_table, new_fdt_size, *new_fdt_addr);
+ efi_free(sys_table, MAX_FDT_SIZE, *new_fdt_addr);
fail:
sys_table->boottime->free_pool(runtime_map);
diff --git a/drivers/firmware/efi/libstub/gop.c b/drivers/firmware/efi/libstub/gop.c
index 932742e4cf23..24c461dea7af 100644
--- a/drivers/firmware/efi/libstub/gop.c
+++ b/drivers/firmware/efi/libstub/gop.c
@@ -149,7 +149,8 @@ setup_gop32(efi_system_table_t *sys_table_arg, struct screen_info *si,
status = __gop_query32(sys_table_arg, gop32, &info, &size,
&current_fb_base);
- if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+ if (status == EFI_SUCCESS && (!first_gop || conout_found) &&
+ info->pixel_format != PIXEL_BLT_ONLY) {
/*
* Systems that use the UEFI Console Splitter may
* provide multiple GOP devices, not all of which are
@@ -266,7 +267,8 @@ setup_gop64(efi_system_table_t *sys_table_arg, struct screen_info *si,
status = __gop_query64(sys_table_arg, gop64, &info, &size,
&current_fb_base);
- if (status == EFI_SUCCESS && (!first_gop || conout_found)) {
+ if (status == EFI_SUCCESS && (!first_gop || conout_found) &&
+ info->pixel_format != PIXEL_BLT_ONLY) {
/*
* Systems that use the UEFI Console Splitter may
* provide multiple GOP devices, not all of which are
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 5da36e56b36a..8c34d50a4d80 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -12,6 +12,8 @@
#include <linux/efi.h>
#include <asm/efi.h>
+#include "efistub.h"
+
/* BIOS variables */
static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
static const efi_char16_t const efi_SecureBoot_name[] = {
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 0403b51d20ba..c29cd5387a35 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -7,6 +7,9 @@ config HYPERV
Select this option to run Linux as a Hyper-V client operating
system.
+config HYPERV_TSCPAGE
+ def_bool HYPERV && X86_64
+
config HYPERV_UTILS
tristate "Microsoft Hyper-V Utilities driver"
depends on HYPERV && CONNECTOR && NLS
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index d7325c6534ad..979ea6ec7902 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -321,7 +321,7 @@ static int etb_set_buffer(struct coresight_device *csdev,
static unsigned long etb_reset_buffer(struct coresight_device *csdev,
struct perf_output_handle *handle,
- void *sink_config, bool *lost)
+ void *sink_config)
{
unsigned long size = 0;
struct cs_buffers *buf = sink_config;
@@ -343,7 +343,6 @@ static unsigned long etb_reset_buffer(struct coresight_device *csdev,
* resetting parameters here and squaring off with the ring
* buffer API in the tracer PMU is fine.
*/
- *lost = !!local_xchg(&buf->lost, 0);
size = local_xchg(&buf->data_size, 0);
}
@@ -385,7 +384,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
(unsigned long)write_ptr);
write_ptr &= ~(ETB_FRAME_SIZE_WORDS - 1);
- local_inc(&buf->lost);
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
}
/*
@@ -396,7 +395,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
*/
status = readl_relaxed(drvdata->base + ETB_STATUS_REG);
if (status & ETB_STATUS_RAM_FULL) {
- local_inc(&buf->lost);
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
to_read = capacity;
read_ptr = write_ptr;
} else {
@@ -429,7 +428,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
if (read_ptr > (drvdata->buffer_depth - 1))
read_ptr -= drvdata->buffer_depth;
/* let the decoder know we've skipped ahead */
- local_inc(&buf->lost);
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
}
/* finally tell HW where we want to start reading from */
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 26cfac3e6de7..288a423c1b27 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -302,7 +302,8 @@ out:
return;
fail_end_stop:
- perf_aux_output_end(handle, 0, true);
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+ perf_aux_output_end(handle, 0);
fail:
event->hw.state = PERF_HES_STOPPED;
goto out;
@@ -310,7 +311,6 @@ fail:
static void etm_event_stop(struct perf_event *event, int mode)
{
- bool lost;
int cpu = smp_processor_id();
unsigned long size;
struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
@@ -348,10 +348,9 @@ static void etm_event_stop(struct perf_event *event, int mode)
return;
size = sink_ops(sink)->reset_buffer(sink, handle,
- event_data->snk_config,
- &lost);
+ event_data->snk_config);
- perf_aux_output_end(handle, size, lost);
+ perf_aux_output_end(handle, size);
}
/* Disabling the path make its elements available to other sessions */
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index ef9d8e93e3b2..5f662d82052c 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -76,7 +76,6 @@ enum cs_mode {
* @nr_pages: max number of pages granted to us
* @offset: offset within the current buffer
* @data_size: how much we collected in this run
- * @lost: other than zero if we had a HW buffer wrap around
* @snapshot: is this run in snapshot mode
* @data_pages: a handle the ring buffer
*/
@@ -85,7 +84,6 @@ struct cs_buffers {
unsigned int nr_pages;
unsigned long offset;
local_t data_size;
- local_t lost;
bool snapshot;
void **data_pages;
};
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 1549436e2492..aec61a6d5c63 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -329,7 +329,7 @@ static int tmc_set_etf_buffer(struct coresight_device *csdev,
static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
struct perf_output_handle *handle,
- void *sink_config, bool *lost)
+ void *sink_config)
{
long size = 0;
struct cs_buffers *buf = sink_config;
@@ -350,7 +350,6 @@ static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
* resetting parameters here and squaring off with the ring
* buffer API in the tracer PMU is fine.
*/
- *lost = !!local_xchg(&buf->lost, 0);
size = local_xchg(&buf->data_size, 0);
}
@@ -389,7 +388,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
*/
status = readl_relaxed(drvdata->base + TMC_STS);
if (status & TMC_STS_FULL) {
- local_inc(&buf->lost);
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
to_read = drvdata->size;
} else {
to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size);
@@ -434,7 +433,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
read_ptr -= drvdata->size;
/* Tell the HW */
writel_relaxed(read_ptr, drvdata->base + TMC_RRP);
- local_inc(&buf->lost);
+ perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
}
cur = buf->cur;
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index b17536d6e69b..63cacf5d6cf2 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1234,7 +1234,7 @@ static void __domain_flush_pages(struct protection_domain *domain,
build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
- for (i = 0; i < amd_iommus_present; ++i) {
+ for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
if (!domain->dev_iommu[i])
continue;
@@ -1278,7 +1278,7 @@ static void domain_flush_complete(struct protection_domain *domain)
{
int i;
- for (i = 0; i < amd_iommus_present; ++i) {
+ for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
if (domain && !domain->dev_iommu[i])
continue;
@@ -3363,7 +3363,7 @@ static int __flush_pasid(struct protection_domain *domain, int pasid,
* IOMMU TLB needs to be flushed before Device TLB to
* prevent device TLB refill from IOMMU TLB
*/
- for (i = 0; i < amd_iommus_present; ++i) {
+ for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
if (domain->dev_iommu[i] == 0)
continue;
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 6130278c5d71..5a11328f4d98 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -167,7 +167,9 @@ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
/* Array to assign indices to IOMMUs*/
struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
+
+/* Number of IOMMUs present in the system */
+static int amd_iommus_present;
/* IOMMUs have a non-present cache? */
bool amd_iommu_np_cache __read_mostly;
@@ -254,10 +256,6 @@ static int amd_iommu_enable_interrupts(void);
static int __init iommu_go_to_state(enum iommu_init_state state);
static void init_device_table_dma(void);
-static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
- u8 bank, u8 cntr, u8 fxn,
- u64 *value, bool is_write);
-
static inline void update_last_devid(u16 devid)
{
if (devid > amd_iommu_last_bdf)
@@ -272,6 +270,11 @@ static inline unsigned long tbl_size(int entry_size)
return 1UL << shift;
}
+int amd_iommu_get_num_iommus(void)
+{
+ return amd_iommus_present;
+}
+
/* Access to l1 and l2 indexed register spaces */
static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
@@ -1336,7 +1339,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
/* Add IOMMU to internal data structures */
list_add_tail(&iommu->list, &amd_iommu_list);
- iommu->index = amd_iommus_present++;
+ iommu->index = amd_iommus_present++;
if (unlikely(iommu->index >= MAX_IOMMUS)) {
WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
@@ -1477,6 +1480,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
return 0;
}
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value, bool is_write);
static void init_iommu_perf_ctr(struct amd_iommu *iommu)
{
@@ -1488,8 +1493,8 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu)
amd_iommu_pc_present = true;
/* Check if the performance counters can be written to */
- if ((0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val, true)) ||
- (0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val2, false)) ||
+ if ((iommu_pc_get_set_reg(iommu, 0, 0, 0, &val, true)) ||
+ (iommu_pc_get_set_reg(iommu, 0, 0, 0, &val2, false)) ||
(val != val2)) {
pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n");
amd_iommu_pc_present = false;
@@ -2711,6 +2716,18 @@ bool amd_iommu_v2_supported(void)
}
EXPORT_SYMBOL(amd_iommu_v2_supported);
+struct amd_iommu *get_amd_iommu(unsigned int idx)
+{
+ unsigned int i = 0;
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ if (i++ == idx)
+ return iommu;
+ return NULL;
+}
+EXPORT_SYMBOL(get_amd_iommu);
+
/****************************************************************************
*
* IOMMU EFR Performance Counter support functionality. This code allows
@@ -2718,17 +2735,14 @@ EXPORT_SYMBOL(amd_iommu_v2_supported);
*
****************************************************************************/
-u8 amd_iommu_pc_get_max_banks(u16 devid)
+u8 amd_iommu_pc_get_max_banks(unsigned int idx)
{
- struct amd_iommu *iommu;
- u8 ret = 0;
+ struct amd_iommu *iommu = get_amd_iommu(idx);
- /* locate the iommu governing the devid */
- iommu = amd_iommu_rlookup_table[devid];
if (iommu)
- ret = iommu->max_banks;
+ return iommu->max_banks;
- return ret;
+ return 0;
}
EXPORT_SYMBOL(amd_iommu_pc_get_max_banks);
@@ -2738,62 +2752,69 @@ bool amd_iommu_pc_supported(void)
}
EXPORT_SYMBOL(amd_iommu_pc_supported);
-u8 amd_iommu_pc_get_max_counters(u16 devid)
+u8 amd_iommu_pc_get_max_counters(unsigned int idx)
{
- struct amd_iommu *iommu;
- u8 ret = 0;
+ struct amd_iommu *iommu = get_amd_iommu(idx);
- /* locate the iommu governing the devid */
- iommu = amd_iommu_rlookup_table[devid];
if (iommu)
- ret = iommu->max_counters;
+ return iommu->max_counters;
- return ret;
+ return 0;
}
EXPORT_SYMBOL(amd_iommu_pc_get_max_counters);
-static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu,
- u8 bank, u8 cntr, u8 fxn,
- u64 *value, bool is_write)
+static int iommu_pc_get_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr,
+ u8 fxn, u64 *value, bool is_write)
{
u32 offset;
u32 max_offset_lim;
+ /* Make sure the IOMMU PC resource is available */
+ if (!amd_iommu_pc_present)
+ return -ENODEV;
+
/* Check for valid iommu and pc register indexing */
- if (WARN_ON((fxn > 0x28) || (fxn & 7)))
+ if (WARN_ON(!iommu || (fxn > 0x28) || (fxn & 7)))
return -ENODEV;
- offset = (u32)(((0x40|bank) << 12) | (cntr << 8) | fxn);
+ offset = (u32)(((0x40 | bank) << 12) | (cntr << 8) | fxn);
/* Limit the offset to the hw defined mmio region aperture */
- max_offset_lim = (u32)(((0x40|iommu->max_banks) << 12) |
+ max_offset_lim = (u32)(((0x40 | iommu->max_banks) << 12) |
(iommu->max_counters << 8) | 0x28);
if ((offset < MMIO_CNTR_REG_OFFSET) ||
(offset > max_offset_lim))
return -EINVAL;
if (is_write) {
- writel((u32)*value, iommu->mmio_base + offset);
- writel((*value >> 32), iommu->mmio_base + offset + 4);
+ u64 val = *value & GENMASK_ULL(47, 0);
+
+ writel((u32)val, iommu->mmio_base + offset);
+ writel((val >> 32), iommu->mmio_base + offset + 4);
} else {
*value = readl(iommu->mmio_base + offset + 4);
*value <<= 32;
- *value = readl(iommu->mmio_base + offset);
+ *value |= readl(iommu->mmio_base + offset);
+ *value &= GENMASK_ULL(47, 0);
}
return 0;
}
-EXPORT_SYMBOL(amd_iommu_pc_get_set_reg_val);
-int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
- u64 *value, bool is_write)
+int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
{
- struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ return -EINVAL;
- /* Make sure the IOMMU PC resource is available */
- if (!amd_iommu_pc_present || iommu == NULL)
- return -ENODEV;
+ return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, false);
+}
+EXPORT_SYMBOL(amd_iommu_pc_get_reg);
+
+int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, u8 fxn, u64 *value)
+{
+ if (!iommu)
+ return -EINVAL;
- return iommu_pc_get_set_reg_val(iommu, bank, cntr, fxn,
- value, is_write);
+ return iommu_pc_get_set_reg(iommu, bank, cntr, fxn, value, true);
}
+EXPORT_SYMBOL(amd_iommu_pc_set_reg);
diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h
index 7eb60c15c582..466260f8a1df 100644
--- a/drivers/iommu/amd_iommu_proto.h
+++ b/drivers/iommu/amd_iommu_proto.h
@@ -21,6 +21,7 @@
#include "amd_iommu_types.h"
+extern int amd_iommu_get_num_iommus(void);
extern int amd_iommu_init_dma_ops(void);
extern int amd_iommu_init_passthrough(void);
extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
@@ -56,13 +57,6 @@ extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid,
extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid);
extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev);
-/* IOMMU Performance Counter functions */
-extern bool amd_iommu_pc_supported(void);
-extern u8 amd_iommu_pc_get_max_banks(u16 devid);
-extern u8 amd_iommu_pc_get_max_counters(u16 devid);
-extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn,
- u64 *value, bool is_write);
-
#ifdef CONFIG_IRQ_REMAP
extern int amd_iommu_create_irq_domain(struct amd_iommu *iommu);
#else
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index 003f3ceb2661..4de8f4160bb8 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -611,9 +611,6 @@ extern struct list_head amd_iommu_list;
*/
extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
/*
* Declarations for the global list of all protection domains
*/
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index d71f6323ac00..b4f79b923aea 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void)
* byte, not the size, hence the "-1").
*/
state->host_gdt_desc.size = GDT_SIZE-1;
- state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
+ state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
/*
* All CPUs on the Host use the same Interrupt Descriptor
@@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void)
* The Host needs to be able to use the LGUEST segments on this
* CPU, too, so put them in the Host GDT.
*/
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
- get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
}
/*
diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c
index 438d4c72c7b3..ff563db025b3 100644
--- a/drivers/pnp/pnpbios/bioscalls.c
+++ b/drivers/pnp/pnpbios/bioscalls.c
@@ -54,7 +54,7 @@ __asm__(".text \n"
#define Q2_SET_SEL(cpu, selname, address, size) \
do { \
- struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
+ struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \
set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \
set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \
} while(0)
@@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
return PNP_FUNCTION_NOT_SUPPORTED;
cpu = get_cpu();
- save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8];
- get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc;
+ save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8];
+ get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc;
/* On some boxes IRQ's during PnP BIOS calls are deadly. */
spin_lock_irqsave(&pnp_bios_lock, flags);
@@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
:"memory");
spin_unlock_irqrestore(&pnp_bios_lock, flags);
- get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40;
+ get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40;
put_cpu();
/* If we get here and this is set then the PnP BIOS faulted on us. */
@@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
pnp_bios_callpoint.segment = PNP_CS16;
for_each_possible_cpu(i) {
- struct desc_struct *gdt = get_cpu_gdt_table(i);
+ struct desc_struct *gdt = get_cpu_gdt_rw(i);
if (!gdt)
continue;
set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32],
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
index d7f73341ced3..7b26dd3aa5d0 100644
--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
@@ -1 +1,2 @@
-obj-$(CONFIG_RAS) += ras.o debugfs.o
+obj-$(CONFIG_RAS) += ras.o debugfs.o
+obj-$(CONFIG_RAS_CEC) += cec.o
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
new file mode 100644
index 000000000000..6aab46d91d33
--- /dev/null
+++ b/drivers/ras/cec.c
@@ -0,0 +1,532 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+
+#include <asm/mce.h>
+
+#include "debugfs.h"
+
+/*
+ * RAS Correctable Errors Collector
+ *
+ * This is a simple gadget which collects correctable errors and counts their
+ * occurrence per physical page address.
+ *
+ * We've opted for possibly the simplest data structure to collect those - an
+ * array of the size of a memory page. It stores 512 u64's with the following
+ * structure:
+ *
+ * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
+ *
+ * The generation in the two highest order bits is two bits which are set to 11b
+ * on every insertion. During the course of each entry's existence, the
+ * generation field gets decremented during spring cleaning to 10b, then 01b and
+ * then 00b.
+ *
+ * This way we're employing the natural numeric ordering to make sure that newly
+ * inserted/touched elements have higher 12-bit counts (which we've manufactured)
+ * and thus iterating over the array initially won't kick out those elements
+ * which were inserted last.
+ *
+ * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
+ * elements entered into the array, during which, we're decaying all elements.
+ * If, after decay, an element gets inserted again, its generation is set to 11b
+ * to make sure it has higher numerical count than other, older elements and
+ * thus emulate an an LRU-like behavior when deleting elements to free up space
+ * in the page.
+ *
+ * When an element reaches it's max count of count_threshold, we try to poison
+ * it by assuming that errors triggered count_threshold times in a single page
+ * are excessive and that page shouldn't be used anymore. count_threshold is
+ * initialized to COUNT_MASK which is the maximum.
+ *
+ * That error event entry causes cec_add_elem() to return !0 value and thus
+ * signal to its callers to log the error.
+ *
+ * To the question why we've chosen a page and moving elements around with
+ * memmove(), it is because it is a very simple structure to handle and max data
+ * movement is 4K which on highly optimized modern CPUs is almost unnoticeable.
+ * We wanted to avoid the pointer traversal of more complex structures like a
+ * linked list or some sort of a balancing search tree.
+ *
+ * Deleting an element takes O(n) but since it is only a single page, it should
+ * be fast enough and it shouldn't happen all too often depending on error
+ * patterns.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RAS: " fmt
+
+/*
+ * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
+ * elements have stayed in the array without having been accessed again.
+ */
+#define DECAY_BITS 2
+#define DECAY_MASK ((1ULL << DECAY_BITS) - 1)
+#define MAX_ELEMS (PAGE_SIZE / sizeof(u64))
+
+/*
+ * Threshold amount of inserted elements after which we start spring
+ * cleaning.
+ */
+#define CLEAN_ELEMS (MAX_ELEMS >> DECAY_BITS)
+
+/* Bits which count the number of errors happened in this 4K page. */
+#define COUNT_BITS (PAGE_SHIFT - DECAY_BITS)
+#define COUNT_MASK ((1ULL << COUNT_BITS) - 1)
+#define FULL_COUNT_MASK (PAGE_SIZE - 1)
+
+/*
+ * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
+ */
+
+#define PFN(e) ((e) >> PAGE_SHIFT)
+#define DECAY(e) (((e) >> COUNT_BITS) & DECAY_MASK)
+#define COUNT(e) ((unsigned int)(e) & COUNT_MASK)
+#define FULL_COUNT(e) ((e) & (PAGE_SIZE - 1))
+
+static struct ce_array {
+ u64 *array; /* container page */
+ unsigned int n; /* number of elements in the array */
+
+ unsigned int decay_count; /*
+ * number of element insertions/increments
+ * since the last spring cleaning.
+ */
+
+ u64 pfns_poisoned; /*
+ * number of PFNs which got poisoned.
+ */
+
+ u64 ces_entered; /*
+ * The number of correctable errors
+ * entered into the collector.
+ */
+
+ u64 decays_done; /*
+ * Times we did spring cleaning.
+ */
+
+ union {
+ struct {
+ __u32 disabled : 1, /* cmdline disabled */
+ __resv : 31;
+ };
+ __u32 flags;
+ };
+} ce_arr;
+
+static DEFINE_MUTEX(ce_mutex);
+static u64 dfs_pfn;
+
+/* Amount of errors after which we offline */
+static unsigned int count_threshold = COUNT_MASK;
+
+/*
+ * The timer "decays" element count each timer_interval which is 24hrs by
+ * default.
+ */
+
+#define CEC_TIMER_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */
+#define CEC_TIMER_MIN_INTERVAL 1 * 60 * 60 /* 1h */
+#define CEC_TIMER_MAX_INTERVAL 30 * 24 * 60 * 60 /* one month */
+static struct timer_list cec_timer;
+static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;
+
+/*
+ * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
+ * element in the array. On insertion and any access, it gets reset to max.
+ */
+static void do_spring_cleaning(struct ce_array *ca)
+{
+ int i;
+
+ for (i = 0; i < ca->n; i++) {
+ u8 decay = DECAY(ca->array[i]);
+
+ if (!decay)
+ continue;
+
+ decay--;
+
+ ca->array[i] &= ~(DECAY_MASK << COUNT_BITS);
+ ca->array[i] |= (decay << COUNT_BITS);
+ }
+ ca->decay_count = 0;
+ ca->decays_done++;
+}
+
+/*
+ * @interval in seconds
+ */
+static void cec_mod_timer(struct timer_list *t, unsigned long interval)
+{
+ unsigned long iv;
+
+ iv = interval * HZ + jiffies;
+
+ mod_timer(t, round_jiffies(iv));
+}
+
+static void cec_timer_fn(unsigned long data)
+{
+ struct ce_array *ca = (struct ce_array *)data;
+
+ do_spring_cleaning(ca);
+
+ cec_mod_timer(&cec_timer, timer_interval);
+}
+
+/*
+ * @to: index of the smallest element which is >= then @pfn.
+ *
+ * Return the index of the pfn if found, otherwise negative value.
+ */
+static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+ u64 this_pfn;
+ int min = 0, max = ca->n;
+
+ while (min < max) {
+ int tmp = (max + min) >> 1;
+
+ this_pfn = PFN(ca->array[tmp]);
+
+ if (this_pfn < pfn)
+ min = tmp + 1;
+ else if (this_pfn > pfn)
+ max = tmp;
+ else {
+ min = tmp;
+ break;
+ }
+ }
+
+ if (to)
+ *to = min;
+
+ this_pfn = PFN(ca->array[min]);
+
+ if (this_pfn == pfn)
+ return min;
+
+ return -ENOKEY;
+}
+
+static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+ WARN_ON(!to);
+
+ if (!ca->n) {
+ *to = 0;
+ return -ENOKEY;
+ }
+ return __find_elem(ca, pfn, to);
+}
+
+static void del_elem(struct ce_array *ca, int idx)
+{
+ /* Save us a function call when deleting the last element. */
+ if (ca->n - (idx + 1))
+ memmove((void *)&ca->array[idx],
+ (void *)&ca->array[idx + 1],
+ (ca->n - (idx + 1)) * sizeof(u64));
+
+ ca->n--;
+}
+
+static u64 del_lru_elem_unlocked(struct ce_array *ca)
+{
+ unsigned int min = FULL_COUNT_MASK;
+ int i, min_idx = 0;
+
+ for (i = 0; i < ca->n; i++) {
+ unsigned int this = FULL_COUNT(ca->array[i]);
+
+ if (min > this) {
+ min = this;
+ min_idx = i;
+ }
+ }
+
+ del_elem(ca, min_idx);
+
+ return PFN(ca->array[min_idx]);
+}
+
+/*
+ * We return the 0th pfn in the error case under the assumption that it cannot
+ * be poisoned and excessive CEs in there are a serious deal anyway.
+ */
+static u64 __maybe_unused del_lru_elem(void)
+{
+ struct ce_array *ca = &ce_arr;
+ u64 pfn;
+
+ if (!ca->n)
+ return 0;
+
+ mutex_lock(&ce_mutex);
+ pfn = del_lru_elem_unlocked(ca);
+ mutex_unlock(&ce_mutex);
+
+ return pfn;
+}
+
+
+int cec_add_elem(u64 pfn)
+{
+ struct ce_array *ca = &ce_arr;
+ unsigned int to;
+ int count, ret = 0;
+
+ /*
+ * We can be called very early on the identify_cpu() path where we are
+ * not initialized yet. We ignore the error for simplicity.
+ */
+ if (!ce_arr.array || ce_arr.disabled)
+ return -ENODEV;
+
+ ca->ces_entered++;
+
+ mutex_lock(&ce_mutex);
+
+ if (ca->n == MAX_ELEMS)
+ WARN_ON(!del_lru_elem_unlocked(ca));
+
+ ret = find_elem(ca, pfn, &to);
+ if (ret < 0) {
+ /*
+ * Shift range [to-end] to make room for one more element.
+ */
+ memmove((void *)&ca->array[to + 1],
+ (void *)&ca->array[to],
+ (ca->n - to) * sizeof(u64));
+
+ ca->array[to] = (pfn << PAGE_SHIFT) |
+ (DECAY_MASK << COUNT_BITS) | 1;
+
+ ca->n++;
+
+ ret = 0;
+
+ goto decay;
+ }
+
+ count = COUNT(ca->array[to]);
+
+ if (count < count_threshold) {
+ ca->array[to] |= (DECAY_MASK << COUNT_BITS);
+ ca->array[to]++;
+
+ ret = 0;
+ } else {
+ u64 pfn = ca->array[to] >> PAGE_SHIFT;
+
+ if (!pfn_valid(pfn)) {
+ pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
+ } else {
+ /* We have reached max count for this page, soft-offline it. */
+ pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
+ memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
+ ca->pfns_poisoned++;
+ }
+
+ del_elem(ca, to);
+
+ /*
+ * Return a >0 value to denote that we've reached the offlining
+ * threshold.
+ */
+ ret = 1;
+
+ goto unlock;
+ }
+
+decay:
+ ca->decay_count++;
+
+ if (ca->decay_count >= CLEAN_ELEMS)
+ do_spring_cleaning(ca);
+
+unlock:
+ mutex_unlock(&ce_mutex);
+
+ return ret;
+}
+
+static int u64_get(void *data, u64 *val)
+{
+ *val = *(u64 *)data;
+
+ return 0;
+}
+
+static int pfn_set(void *data, u64 val)
+{
+ *(u64 *)data = val;
+
+ return cec_add_elem(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
+
+static int decay_interval_set(void *data, u64 val)
+{
+ *(u64 *)data = val;
+
+ if (val < CEC_TIMER_MIN_INTERVAL)
+ return -EINVAL;
+
+ if (val > CEC_TIMER_MAX_INTERVAL)
+ return -EINVAL;
+
+ timer_interval = val;
+
+ cec_mod_timer(&cec_timer, timer_interval);
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
+
+static int count_threshold_set(void *data, u64 val)
+{
+ *(u64 *)data = val;
+
+ if (val > COUNT_MASK)
+ val = COUNT_MASK;
+
+ count_threshold = val;
+
+ return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
+
+static int array_dump(struct seq_file *m, void *v)
+{
+ struct ce_array *ca = &ce_arr;
+ u64 prev = 0;
+ int i;
+
+ mutex_lock(&ce_mutex);
+
+ seq_printf(m, "{ n: %d\n", ca->n);
+ for (i = 0; i < ca->n; i++) {
+ u64 this = PFN(ca->array[i]);
+
+ seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
+
+ WARN_ON(prev > this);
+
+ prev = this;
+ }
+
+ seq_printf(m, "}\n");
+
+ seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
+ ca->ces_entered, ca->pfns_poisoned);
+
+ seq_printf(m, "Flags: 0x%x\n", ca->flags);
+
+ seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
+ seq_printf(m, "Decays: %lld\n", ca->decays_done);
+
+ seq_printf(m, "Action threshold: %d\n", count_threshold);
+
+ mutex_unlock(&ce_mutex);
+
+ return 0;
+}
+
+static int array_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, array_dump, NULL);
+}
+
+static const struct file_operations array_ops = {
+ .owner = THIS_MODULE,
+ .open = array_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init create_debugfs_nodes(void)
+{
+ struct dentry *d, *pfn, *decay, *count, *array;
+
+ d = debugfs_create_dir("cec", ras_debugfs_dir);
+ if (!d) {
+ pr_warn("Error creating cec debugfs node!\n");
+ return -1;
+ }
+
+ pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
+ if (!pfn) {
+ pr_warn("Error creating pfn debugfs node!\n");
+ goto err;
+ }
+
+ array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
+ if (!array) {
+ pr_warn("Error creating array debugfs node!\n");
+ goto err;
+ }
+
+ decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
+ &timer_interval, &decay_interval_ops);
+ if (!decay) {
+ pr_warn("Error creating decay_interval debugfs node!\n");
+ goto err;
+ }
+
+ count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
+ &count_threshold, &count_threshold_ops);
+ if (!decay) {
+ pr_warn("Error creating count_threshold debugfs node!\n");
+ goto err;
+ }
+
+
+ return 0;
+
+err:
+ debugfs_remove_recursive(d);
+
+ return 1;
+}
+
+void __init cec_init(void)
+{
+ if (ce_arr.disabled)
+ return;
+
+ ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!ce_arr.array) {
+ pr_err("Error allocating CE array page!\n");
+ return;
+ }
+
+ if (create_debugfs_nodes())
+ return;
+
+ setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
+ cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);
+
+ pr_info("Correctable Errors collector initialized.\n");
+}
+
+int __init parse_cec_param(char *str)
+{
+ if (!str)
+ return 0;
+
+ if (*str == '=')
+ str++;
+
+ if (!strncmp(str, "cec_disable", 7))
+ ce_arr.disabled = 1;
+ else
+ return 0;
+
+ return 1;
+}
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c
index 0322acf67ea5..501603057dff 100644
--- a/drivers/ras/debugfs.c
+++ b/drivers/ras/debugfs.c
@@ -1,6 +1,6 @@
#include <linux/debugfs.h>
-static struct dentry *ras_debugfs_dir;
+struct dentry *ras_debugfs_dir;
static atomic_t trace_count = ATOMIC_INIT(0);
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h
new file mode 100644
index 000000000000..db72e4513191
--- /dev/null
+++ b/drivers/ras/debugfs.h
@@ -0,0 +1,8 @@
+#ifndef __RAS_DEBUGFS_H__
+#define __RAS_DEBUGFS_H__
+
+#include <linux/debugfs.h>
+
+extern struct dentry *ras_debugfs_dir;
+
+#endif /* __RAS_DEBUGFS_H__ */
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd362b7b6..94f8038864b4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+
+
+int __init parse_ras_param(char *str)
+{
+#ifdef CONFIG_RAS_CEC
+ parse_cec_param(str);
+#endif
+
+ return 1;
+}
+__setup("ras", parse_ras_param);
diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index 7791af6c102c..53d1356bbd06 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_USB_MICROTEK) += image/
obj-$(CONFIG_USB_SERIAL) += serial/
obj-$(CONFIG_USB) += misc/
-obj-$(CONFIG_EARLY_PRINTK_DBGP) += early/
+obj-$(CONFIG_EARLY_PRINTK_USB) += early/
obj-$(CONFIG_USB_ATM) += atm/
obj-$(CONFIG_USB_SPEEDTOUCH) += atm/
diff --git a/drivers/usb/early/Makefile b/drivers/usb/early/Makefile
index 24bbe519c737..fcde2286da1c 100644
--- a/drivers/usb/early/Makefile
+++ b/drivers/usb/early/Makefile
@@ -3,3 +3,4 @@
#
obj-$(CONFIG_EARLY_PRINTK_DBGP) += ehci-dbgp.o
+obj-$(CONFIG_EARLY_PRINTK_USB_XDBC) += xhci-dbc.o
diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
new file mode 100644
index 000000000000..1268818e2263
--- /dev/null
+++ b/drivers/usb/early/xhci-dbc.c
@@ -0,0 +1,1014 @@
+/**
+ * xhci-dbc.c - xHCI debug capability early driver
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/console.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <asm/pci-direct.h>
+#include <asm/fixmap.h>
+#include <linux/bcd.h>
+#include <linux/export.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#include "../host/xhci.h"
+#include "xhci-dbc.h"
+
+static struct xdbc_state xdbc;
+static bool early_console_keep;
+
+#define XDBC_TRACE
+#ifdef XDBC_TRACE
+#define xdbc_trace trace_printk
+#else
+static inline void xdbc_trace(const char *fmt, ...) { }
+#endif /* XDBC_TRACE */
+
+static void __iomem * __init xdbc_map_pci_mmio(u32 bus, u32 dev, u32 func)
+{
+ u64 val64, sz64, mask64;
+ void __iomem *base;
+ u32 val, sz;
+ u8 byte;
+
+ val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0);
+ write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, ~0);
+ sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0);
+ write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, val);
+
+ if (val == 0xffffffff || sz == 0xffffffff) {
+ pr_notice("invalid mmio bar\n");
+ return NULL;
+ }
+
+ val64 = val & PCI_BASE_ADDRESS_MEM_MASK;
+ sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK;
+ mask64 = PCI_BASE_ADDRESS_MEM_MASK;
+
+ if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+ val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4);
+ write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, ~0);
+ sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4);
+ write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, val);
+
+ val64 |= (u64)val << 32;
+ sz64 |= (u64)sz << 32;
+ mask64 |= ~0ULL << 32;
+ }
+
+ sz64 &= mask64;
+
+ if (!sz64) {
+ pr_notice("invalid mmio address\n");
+ return NULL;
+ }
+
+ sz64 = 1ULL << __ffs64(sz64);
+
+ /* Check if the mem space is enabled: */
+ byte = read_pci_config_byte(bus, dev, func, PCI_COMMAND);
+ if (!(byte & PCI_COMMAND_MEMORY)) {
+ byte |= PCI_COMMAND_MEMORY;
+ write_pci_config_byte(bus, dev, func, PCI_COMMAND, byte);
+ }
+
+ xdbc.xhci_start = val64;
+ xdbc.xhci_length = sz64;
+ base = early_ioremap(val64, sz64);
+
+ return base;
+}
+
+static void * __init xdbc_get_page(dma_addr_t *dma_addr)
+{
+ void *virt;
+
+ virt = alloc_bootmem_pages_nopanic(PAGE_SIZE);
+ if (!virt)
+ return NULL;
+
+ if (dma_addr)
+ *dma_addr = (dma_addr_t)__pa(virt);
+
+ return virt;
+}
+
+static u32 __init xdbc_find_dbgp(int xdbc_num, u32 *b, u32 *d, u32 *f)
+{
+ u32 bus, dev, func, class;
+
+ for (bus = 0; bus < XDBC_PCI_MAX_BUSES; bus++) {
+ for (dev = 0; dev < XDBC_PCI_MAX_DEVICES; dev++) {
+ for (func = 0; func < XDBC_PCI_MAX_FUNCTION; func++) {
+
+ class = read_pci_config(bus, dev, func, PCI_CLASS_REVISION);
+ if ((class >> 8) != PCI_CLASS_SERIAL_USB_XHCI)
+ continue;
+
+ if (xdbc_num-- != 0)
+ continue;
+
+ *b = bus;
+ *d = dev;
+ *f = func;
+
+ return 0;
+ }
+ }
+ }
+
+ return -1;
+}
+
+static int handshake(void __iomem *ptr, u32 mask, u32 done, int wait, int delay)
+{
+ u32 result;
+
+ do {
+ result = readl(ptr);
+ result &= mask;
+ if (result == done)
+ return 0;
+ udelay(delay);
+ wait -= delay;
+ } while (wait > 0);
+
+ return -ETIMEDOUT;
+}
+
+static void __init xdbc_bios_handoff(void)
+{
+ int offset, timeout;
+ u32 val;
+
+ offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_LEGACY);
+ val = readl(xdbc.xhci_base + offset);
+
+ if (val & XHCI_HC_BIOS_OWNED) {
+ writel(val | XHCI_HC_OS_OWNED, xdbc.xhci_base + offset);
+ timeout = handshake(xdbc.xhci_base + offset, XHCI_HC_BIOS_OWNED, 0, 5000, 10);
+
+ if (timeout) {
+ pr_notice("failed to hand over xHCI control from BIOS\n");
+ writel(val & ~XHCI_HC_BIOS_OWNED, xdbc.xhci_base + offset);
+ }
+ }
+
+ /* Disable BIOS SMIs and clear all SMI events: */
+ val = readl(xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET);
+ val &= XHCI_LEGACY_DISABLE_SMI;
+ val |= XHCI_LEGACY_SMI_EVENTS;
+ writel(val, xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET);
+}
+
+static int __init
+xdbc_alloc_ring(struct xdbc_segment *seg, struct xdbc_ring *ring)
+{
+ seg->trbs = xdbc_get_page(&seg->dma);
+ if (!seg->trbs)
+ return -ENOMEM;
+
+ ring->segment = seg;
+
+ return 0;
+}
+
+static void __init xdbc_free_ring(struct xdbc_ring *ring)
+{
+ struct xdbc_segment *seg = ring->segment;
+
+ if (!seg)
+ return;
+
+ free_bootmem(seg->dma, PAGE_SIZE);
+ ring->segment = NULL;
+}
+
+static void xdbc_reset_ring(struct xdbc_ring *ring)
+{
+ struct xdbc_segment *seg = ring->segment;
+ struct xdbc_trb *link_trb;
+
+ memset(seg->trbs, 0, PAGE_SIZE);
+
+ ring->enqueue = seg->trbs;
+ ring->dequeue = seg->trbs;
+ ring->cycle_state = 1;
+
+ if (ring != &xdbc.evt_ring) {
+ link_trb = &seg->trbs[XDBC_TRBS_PER_SEGMENT - 1];
+ link_trb->field[0] = cpu_to_le32(lower_32_bits(seg->dma));
+ link_trb->field[1] = cpu_to_le32(upper_32_bits(seg->dma));
+ link_trb->field[3] = cpu_to_le32(TRB_TYPE(TRB_LINK)) | cpu_to_le32(LINK_TOGGLE);
+ }
+}
+
+static inline void xdbc_put_utf16(u16 *s, const char *c, size_t size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ s[i] = cpu_to_le16(c[i]);
+}
+
+static void xdbc_mem_init(void)
+{
+ struct xdbc_ep_context *ep_in, *ep_out;
+ struct usb_string_descriptor *s_desc;
+ struct xdbc_erst_entry *entry;
+ struct xdbc_strings *strings;
+ struct xdbc_context *ctx;
+ unsigned int max_burst;
+ u32 string_length;
+ int index = 0;
+ u32 dev_info;
+
+ xdbc_reset_ring(&xdbc.evt_ring);
+ xdbc_reset_ring(&xdbc.in_ring);
+ xdbc_reset_ring(&xdbc.out_ring);
+ memset(xdbc.table_base, 0, PAGE_SIZE);
+ memset(xdbc.out_buf, 0, PAGE_SIZE);
+
+ /* Initialize event ring segment table: */
+ xdbc.erst_size = 16;
+ xdbc.erst_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE;
+ xdbc.erst_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE;
+
+ index += XDBC_ERST_ENTRY_NUM;
+ entry = (struct xdbc_erst_entry *)xdbc.erst_base;
+
+ entry->seg_addr = cpu_to_le64(xdbc.evt_seg.dma);
+ entry->seg_size = cpu_to_le32(XDBC_TRBS_PER_SEGMENT);
+ entry->__reserved_0 = 0;
+
+ /* Initialize ERST registers: */
+ writel(1, &xdbc.xdbc_reg->ersts);
+ xdbc_write64(xdbc.erst_dma, &xdbc.xdbc_reg->erstba);
+ xdbc_write64(xdbc.evt_seg.dma, &xdbc.xdbc_reg->erdp);
+
+ /* Debug capability contexts: */
+ xdbc.dbcc_size = 64 * 3;
+ xdbc.dbcc_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE;
+ xdbc.dbcc_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE;
+
+ index += XDBC_DBCC_ENTRY_NUM;
+
+ /* Popluate the strings: */
+ xdbc.string_size = sizeof(struct xdbc_strings);
+ xdbc.string_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE;
+ xdbc.string_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE;
+ strings = (struct xdbc_strings *)xdbc.string_base;
+
+ index += XDBC_STRING_ENTRY_NUM;
+
+ /* Serial string: */
+ s_desc = (struct usb_string_descriptor *)strings->serial;
+ s_desc->bLength = (strlen(XDBC_STRING_SERIAL) + 1) * 2;
+ s_desc->bDescriptorType = USB_DT_STRING;
+
+ xdbc_put_utf16(s_desc->wData, XDBC_STRING_SERIAL, strlen(XDBC_STRING_SERIAL));
+ string_length = s_desc->bLength;
+ string_length <<= 8;
+
+ /* Product string: */
+ s_desc = (struct usb_string_descriptor *)strings->product;
+ s_desc->bLength = (strlen(XDBC_STRING_PRODUCT) + 1) * 2;
+ s_desc->bDescriptorType = USB_DT_STRING;
+
+ xdbc_put_utf16(s_desc->wData, XDBC_STRING_PRODUCT, strlen(XDBC_STRING_PRODUCT));
+ string_length += s_desc->bLength;
+ string_length <<= 8;
+
+ /* Manufacture string: */
+ s_desc = (struct usb_string_descriptor *)strings->manufacturer;
+ s_desc->bLength = (strlen(XDBC_STRING_MANUFACTURER) + 1) * 2;
+ s_desc->bDescriptorType = USB_DT_STRING;
+
+ xdbc_put_utf16(s_desc->wData, XDBC_STRING_MANUFACTURER, strlen(XDBC_STRING_MANUFACTURER));
+ string_length += s_desc->bLength;
+ string_length <<= 8;
+
+ /* String0: */
+ strings->string0[0] = 4;
+ strings->string0[1] = USB_DT_STRING;
+ strings->string0[2] = 0x09;
+ strings->string0[3] = 0x04;
+
+ string_length += 4;
+
+ /* Populate info Context: */
+ ctx = (struct xdbc_context *)xdbc.dbcc_base;
+
+ ctx->info.string0 = cpu_to_le64(xdbc.string_dma);
+ ctx->info.manufacturer = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH);
+ ctx->info.product = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 2);
+ ctx->info.serial = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 3);
+ ctx->info.length = cpu_to_le32(string_length);
+
+ /* Populate bulk out endpoint context: */
+ max_burst = DEBUG_MAX_BURST(readl(&xdbc.xdbc_reg->control));
+ ep_out = (struct xdbc_ep_context *)&ctx->out;
+
+ ep_out->ep_info1 = 0;
+ ep_out->ep_info2 = cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst));
+ ep_out->deq = cpu_to_le64(xdbc.out_seg.dma | xdbc.out_ring.cycle_state);
+
+ /* Populate bulk in endpoint context: */
+ ep_in = (struct xdbc_ep_context *)&ctx->in;
+
+ ep_in->ep_info1 = 0;
+ ep_in->ep_info2 = cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst));
+ ep_in->deq = cpu_to_le64(xdbc.in_seg.dma | xdbc.in_ring.cycle_state);
+
+ /* Set DbC context and info registers: */
+ xdbc_write64(xdbc.dbcc_dma, &xdbc.xdbc_reg->dccp);
+
+ dev_info = cpu_to_le32((XDBC_VENDOR_ID << 16) | XDBC_PROTOCOL);
+ writel(dev_info, &xdbc.xdbc_reg->devinfo1);
+
+ dev_info = cpu_to_le32((XDBC_DEVICE_REV << 16) | XDBC_PRODUCT_ID);
+ writel(dev_info, &xdbc.xdbc_reg->devinfo2);
+
+ xdbc.in_buf = xdbc.out_buf + XDBC_MAX_PACKET;
+ xdbc.in_dma = xdbc.out_dma + XDBC_MAX_PACKET;
+}
+
+static void xdbc_do_reset_debug_port(u32 id, u32 count)
+{
+ void __iomem *ops_reg;
+ void __iomem *portsc;
+ u32 val, cap_length;
+ int i;
+
+ cap_length = readl(xdbc.xhci_base) & 0xff;
+ ops_reg = xdbc.xhci_base + cap_length;
+
+ id--;
+ for (i = id; i < (id + count); i++) {
+ portsc = ops_reg + 0x400 + i * 0x10;
+ val = readl(portsc);
+ if (!(val & PORT_CONNECT))
+ writel(val | PORT_RESET, portsc);
+ }
+}
+
+static void xdbc_reset_debug_port(void)
+{
+ u32 val, port_offset, port_count;
+ int offset = 0;
+
+ do {
+ offset = xhci_find_next_ext_cap(xdbc.xhci_base, offset, XHCI_EXT_CAPS_PROTOCOL);
+ if (!offset)
+ break;
+
+ val = readl(xdbc.xhci_base + offset);
+ if (XHCI_EXT_PORT_MAJOR(val) != 0x3)
+ continue;
+
+ val = readl(xdbc.xhci_base + offset + 8);
+ port_offset = XHCI_EXT_PORT_OFF(val);
+ port_count = XHCI_EXT_PORT_COUNT(val);
+
+ xdbc_do_reset_debug_port(port_offset, port_count);
+ } while (1);
+}
+
+static void
+xdbc_queue_trb(struct xdbc_ring *ring, u32 field1, u32 field2, u32 field3, u32 field4)
+{
+ struct xdbc_trb *trb, *link_trb;
+
+ trb = ring->enqueue;
+ trb->field[0] = cpu_to_le32(field1);
+ trb->field[1] = cpu_to_le32(field2);
+ trb->field[2] = cpu_to_le32(field3);
+ trb->field[3] = cpu_to_le32(field4);
+
+ ++(ring->enqueue);
+ if (ring->enqueue >= &ring->segment->trbs[TRBS_PER_SEGMENT - 1]) {
+ link_trb = ring->enqueue;
+ if (ring->cycle_state)
+ link_trb->field[3] |= cpu_to_le32(TRB_CYCLE);
+ else
+ link_trb->field[3] &= cpu_to_le32(~TRB_CYCLE);
+
+ ring->enqueue = ring->segment->trbs;
+ ring->cycle_state ^= 1;
+ }
+}
+
+static void xdbc_ring_doorbell(int target)
+{
+ writel(DOOR_BELL_TARGET(target), &xdbc.xdbc_reg->doorbell);
+}
+
+static int xdbc_start(void)
+{
+ u32 ctrl, status;
+ int ret;
+
+ ctrl = readl(&xdbc.xdbc_reg->control);
+ writel(ctrl | CTRL_DBC_ENABLE | CTRL_PORT_ENABLE, &xdbc.xdbc_reg->control);
+ ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, CTRL_DBC_ENABLE, 100000, 100);
+ if (ret) {
+ xdbc_trace("failed to initialize hardware\n");
+ return ret;
+ }
+
+ /* Reset port to avoid bus hang: */
+ if (xdbc.vendor == PCI_VENDOR_ID_INTEL)
+ xdbc_reset_debug_port();
+
+ /* Wait for port connection: */
+ ret = handshake(&xdbc.xdbc_reg->portsc, PORTSC_CONN_STATUS, PORTSC_CONN_STATUS, 5000000, 100);
+ if (ret) {
+ xdbc_trace("waiting for connection timed out\n");
+ return ret;
+ }
+
+ /* Wait for debug device to be configured: */
+ ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_RUN, CTRL_DBC_RUN, 5000000, 100);
+ if (ret) {
+ xdbc_trace("waiting for device configuration timed out\n");
+ return ret;
+ }
+
+ /* Check port number: */
+ status = readl(&xdbc.xdbc_reg->status);
+ if (!DCST_DEBUG_PORT(status)) {
+ xdbc_trace("invalid root hub port number\n");
+ return -ENODEV;
+ }
+
+ xdbc.port_number = DCST_DEBUG_PORT(status);
+
+ xdbc_trace("DbC is running now, control 0x%08x port ID %d\n",
+ readl(&xdbc.xdbc_reg->control), xdbc.port_number);
+
+ return 0;
+}
+
+static int xdbc_bulk_transfer(void *data, int size, bool read)
+{
+ struct xdbc_ring *ring;
+ struct xdbc_trb *trb;
+ u32 length, control;
+ u32 cycle;
+ u64 addr;
+
+ if (size > XDBC_MAX_PACKET) {
+ xdbc_trace("bad parameter, size %d\n", size);
+ return -EINVAL;
+ }
+
+ if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED) ||
+ !(xdbc.flags & XDBC_FLAGS_CONFIGURED) ||
+ (!read && (xdbc.flags & XDBC_FLAGS_OUT_STALL)) ||
+ (read && (xdbc.flags & XDBC_FLAGS_IN_STALL))) {
+
+ xdbc_trace("connection not ready, flags %08x\n", xdbc.flags);
+ return -EIO;
+ }
+
+ ring = (read ? &xdbc.in_ring : &xdbc.out_ring);
+ trb = ring->enqueue;
+ cycle = ring->cycle_state;
+ length = TRB_LEN(size);
+ control = TRB_TYPE(TRB_NORMAL) | TRB_IOC;
+
+ if (cycle)
+ control &= cpu_to_le32(~TRB_CYCLE);
+ else
+ control |= cpu_to_le32(TRB_CYCLE);
+
+ if (read) {
+ memset(xdbc.in_buf, 0, XDBC_MAX_PACKET);
+ addr = xdbc.in_dma;
+ xdbc.flags |= XDBC_FLAGS_IN_PROCESS;
+ } else {
+ memset(xdbc.out_buf, 0, XDBC_MAX_PACKET);
+ memcpy(xdbc.out_buf, data, size);
+ addr = xdbc.out_dma;
+ xdbc.flags |= XDBC_FLAGS_OUT_PROCESS;
+ }
+
+ xdbc_queue_trb(ring, lower_32_bits(addr), upper_32_bits(addr), length, control);
+
+ /*
+ * Add a barrier between writes of trb fields and flipping
+ * the cycle bit:
+ */
+ wmb();
+ if (cycle)
+ trb->field[3] |= cpu_to_le32(cycle);
+ else
+ trb->field[3] &= cpu_to_le32(~TRB_CYCLE);
+
+ xdbc_ring_doorbell(read ? IN_EP_DOORBELL : OUT_EP_DOORBELL);
+
+ return size;
+}
+
+static int xdbc_handle_external_reset(void)
+{
+ int ret = 0;
+
+ xdbc.flags = 0;
+ writel(0, &xdbc.xdbc_reg->control);
+ ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 10);
+ if (ret)
+ goto reset_out;
+
+ xdbc_mem_init();
+
+ mmiowb();
+
+ ret = xdbc_start();
+ if (ret < 0)
+ goto reset_out;
+
+ xdbc_trace("dbc recovered\n");
+
+ xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED;
+
+ xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+
+ return 0;
+
+reset_out:
+ xdbc_trace("failed to recover from external reset\n");
+ return ret;
+}
+
+static int __init xdbc_early_setup(void)
+{
+ int ret;
+
+ writel(0, &xdbc.xdbc_reg->control);
+ ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 100);
+ if (ret)
+ return ret;
+
+ /* Allocate the table page: */
+ xdbc.table_base = xdbc_get_page(&xdbc.table_dma);
+ if (!xdbc.table_base)
+ return -ENOMEM;
+
+ /* Get and store the transfer buffer: */
+ xdbc.out_buf = xdbc_get_page(&xdbc.out_dma);
+ if (!xdbc.out_buf)
+ return -ENOMEM;
+
+ /* Allocate the event ring: */
+ ret = xdbc_alloc_ring(&xdbc.evt_seg, &xdbc.evt_ring);
+ if (ret < 0)
+ return ret;
+
+ /* Allocate IN/OUT endpoint transfer rings: */
+ ret = xdbc_alloc_ring(&xdbc.in_seg, &xdbc.in_ring);
+ if (ret < 0)
+ return ret;
+
+ ret = xdbc_alloc_ring(&xdbc.out_seg, &xdbc.out_ring);
+ if (ret < 0)
+ return ret;
+
+ xdbc_mem_init();
+
+ mmiowb();
+
+ ret = xdbc_start();
+ if (ret < 0) {
+ writel(0, &xdbc.xdbc_reg->control);
+ return ret;
+ }
+
+ xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED;
+
+ xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+
+ return 0;
+}
+
+int __init early_xdbc_parse_parameter(char *s)
+{
+ unsigned long dbgp_num = 0;
+ u32 bus, dev, func, offset;
+ int ret;
+
+ if (!early_pci_allowed())
+ return -EPERM;
+
+ if (strstr(s, "keep"))
+ early_console_keep = true;
+
+ if (xdbc.xdbc_reg)
+ return 0;
+
+ if (*s && kstrtoul(s, 0, &dbgp_num))
+ dbgp_num = 0;
+
+ pr_notice("dbgp_num: %lu\n", dbgp_num);
+
+ /* Locate the host controller: */
+ ret = xdbc_find_dbgp(dbgp_num, &bus, &dev, &func);
+ if (ret) {
+ pr_notice("failed to locate xhci host\n");
+ return -ENODEV;
+ }
+
+ xdbc.vendor = read_pci_config_16(bus, dev, func, PCI_VENDOR_ID);
+ xdbc.device = read_pci_config_16(bus, dev, func, PCI_DEVICE_ID);
+ xdbc.bus = bus;
+ xdbc.dev = dev;
+ xdbc.func = func;
+
+ /* Map the IO memory: */
+ xdbc.xhci_base = xdbc_map_pci_mmio(bus, dev, func);
+ if (!xdbc.xhci_base)
+ return -EINVAL;
+
+ /* Locate DbC registers: */
+ offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG);
+ if (!offset) {
+ pr_notice("xhci host doesn't support debug capability\n");
+ early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
+ xdbc.xhci_base = NULL;
+ xdbc.xhci_length = 0;
+
+ return -ENODEV;
+ }
+ xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset);
+
+ return 0;
+}
+
+int __init early_xdbc_setup_hardware(void)
+{
+ int ret;
+
+ if (!xdbc.xdbc_reg)
+ return -ENODEV;
+
+ xdbc_bios_handoff();
+
+ raw_spin_lock_init(&xdbc.lock);
+
+ ret = xdbc_early_setup();
+ if (ret) {
+ pr_notice("failed to setup the connection to host\n");
+
+ xdbc_free_ring(&xdbc.evt_ring);
+ xdbc_free_ring(&xdbc.out_ring);
+ xdbc_free_ring(&xdbc.in_ring);
+
+ if (xdbc.table_dma)
+ free_bootmem(xdbc.table_dma, PAGE_SIZE);
+
+ if (xdbc.out_dma)
+ free_bootmem(xdbc.out_dma, PAGE_SIZE);
+
+ xdbc.table_base = NULL;
+ xdbc.out_buf = NULL;
+ }
+
+ return ret;
+}
+
+static void xdbc_handle_port_status(struct xdbc_trb *evt_trb)
+{
+ u32 port_reg;
+
+ port_reg = readl(&xdbc.xdbc_reg->portsc);
+ if (port_reg & PORTSC_CONN_CHANGE) {
+ xdbc_trace("connect status change event\n");
+
+ /* Check whether cable unplugged: */
+ if (!(port_reg & PORTSC_CONN_STATUS)) {
+ xdbc.flags = 0;
+ xdbc_trace("cable unplugged\n");
+ }
+ }
+
+ if (port_reg & PORTSC_RESET_CHANGE)
+ xdbc_trace("port reset change event\n");
+
+ if (port_reg & PORTSC_LINK_CHANGE)
+ xdbc_trace("port link status change event\n");
+
+ if (port_reg & PORTSC_CONFIG_CHANGE)
+ xdbc_trace("config error change\n");
+
+ /* Write back the value to clear RW1C bits: */
+ writel(port_reg, &xdbc.xdbc_reg->portsc);
+}
+
+static void xdbc_handle_tx_event(struct xdbc_trb *evt_trb)
+{
+ size_t remain_length;
+ u32 comp_code;
+ int ep_id;
+
+ comp_code = GET_COMP_CODE(le32_to_cpu(evt_trb->field[2]));
+ remain_length = EVENT_TRB_LEN(le32_to_cpu(evt_trb->field[2]));
+ ep_id = TRB_TO_EP_ID(le32_to_cpu(evt_trb->field[3]));
+
+ switch (comp_code) {
+ case COMP_SUCCESS:
+ remain_length = 0;
+ case COMP_SHORT_PACKET:
+ break;
+ case COMP_TRB_ERROR:
+ case COMP_BABBLE_DETECTED_ERROR:
+ case COMP_USB_TRANSACTION_ERROR:
+ case COMP_STALL_ERROR:
+ default:
+ if (ep_id == XDBC_EPID_OUT)
+ xdbc.flags |= XDBC_FLAGS_OUT_STALL;
+ if (ep_id == XDBC_EPID_IN)
+ xdbc.flags |= XDBC_FLAGS_IN_STALL;
+
+ xdbc_trace("endpoint %d stalled\n", ep_id);
+ break;
+ }
+
+ if (ep_id == XDBC_EPID_IN) {
+ xdbc.flags &= ~XDBC_FLAGS_IN_PROCESS;
+ xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+ } else if (ep_id == XDBC_EPID_OUT) {
+ xdbc.flags &= ~XDBC_FLAGS_OUT_PROCESS;
+ } else {
+ xdbc_trace("invalid endpoint id %d\n", ep_id);
+ }
+}
+
+static void xdbc_handle_events(void)
+{
+ struct xdbc_trb *evt_trb;
+ bool update_erdp = false;
+ u32 reg;
+ u8 cmd;
+
+ cmd = read_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND);
+ if (!(cmd & PCI_COMMAND_MASTER)) {
+ cmd |= PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY;
+ write_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND, cmd);
+ }
+
+ if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED))
+ return;
+
+ /* Handle external reset events: */
+ reg = readl(&xdbc.xdbc_reg->control);
+ if (!(reg & CTRL_DBC_ENABLE)) {
+ if (xdbc_handle_external_reset()) {
+ xdbc_trace("failed to recover connection\n");
+ return;
+ }
+ }
+
+ /* Handle configure-exit event: */
+ reg = readl(&xdbc.xdbc_reg->control);
+ if (reg & CTRL_DBC_RUN_CHANGE) {
+ writel(reg, &xdbc.xdbc_reg->control);
+ if (reg & CTRL_DBC_RUN)
+ xdbc.flags |= XDBC_FLAGS_CONFIGURED;
+ else
+ xdbc.flags &= ~XDBC_FLAGS_CONFIGURED;
+ }
+
+ /* Handle endpoint stall event: */
+ reg = readl(&xdbc.xdbc_reg->control);
+ if (reg & CTRL_HALT_IN_TR) {
+ xdbc.flags |= XDBC_FLAGS_IN_STALL;
+ } else {
+ xdbc.flags &= ~XDBC_FLAGS_IN_STALL;
+ if (!(xdbc.flags & XDBC_FLAGS_IN_PROCESS))
+ xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+ }
+
+ if (reg & CTRL_HALT_OUT_TR)
+ xdbc.flags |= XDBC_FLAGS_OUT_STALL;
+ else
+ xdbc.flags &= ~XDBC_FLAGS_OUT_STALL;
+
+ /* Handle the events in the event ring: */
+ evt_trb = xdbc.evt_ring.dequeue;
+ while ((le32_to_cpu(evt_trb->field[3]) & TRB_CYCLE) == xdbc.evt_ring.cycle_state) {
+ /*
+ * Add a barrier between reading the cycle flag and any
+ * reads of the event's flags/data below:
+ */
+ rmb();
+
+ switch ((le32_to_cpu(evt_trb->field[3]) & TRB_TYPE_BITMASK)) {
+ case TRB_TYPE(TRB_PORT_STATUS):
+ xdbc_handle_port_status(evt_trb);
+ break;
+ case TRB_TYPE(TRB_TRANSFER):
+ xdbc_handle_tx_event(evt_trb);
+ break;
+ default:
+ break;
+ }
+
+ ++(xdbc.evt_ring.dequeue);
+ if (xdbc.evt_ring.dequeue == &xdbc.evt_seg.trbs[TRBS_PER_SEGMENT]) {
+ xdbc.evt_ring.dequeue = xdbc.evt_seg.trbs;
+ xdbc.evt_ring.cycle_state ^= 1;
+ }
+
+ evt_trb = xdbc.evt_ring.dequeue;
+ update_erdp = true;
+ }
+
+ /* Update event ring dequeue pointer: */
+ if (update_erdp)
+ xdbc_write64(__pa(xdbc.evt_ring.dequeue), &xdbc.xdbc_reg->erdp);
+}
+
+static int xdbc_bulk_write(const char *bytes, int size)
+{
+ int ret, timeout = 0;
+ unsigned long flags;
+
+retry:
+ if (in_nmi()) {
+ if (!raw_spin_trylock_irqsave(&xdbc.lock, flags))
+ return -EAGAIN;
+ } else {
+ raw_spin_lock_irqsave(&xdbc.lock, flags);
+ }
+
+ xdbc_handle_events();
+
+ /* Check completion of the previous request: */
+ if ((xdbc.flags & XDBC_FLAGS_OUT_PROCESS) && (timeout < 2000000)) {
+ raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+ udelay(100);
+ timeout += 100;
+ goto retry;
+ }
+
+ if (xdbc.flags & XDBC_FLAGS_OUT_PROCESS) {
+ raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+ xdbc_trace("previous transfer not completed yet\n");
+
+ return -ETIMEDOUT;
+ }
+
+ ret = xdbc_bulk_transfer((void *)bytes, size, false);
+ raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+
+ return ret;
+}
+
+static void early_xdbc_write(struct console *con, const char *str, u32 n)
+{
+ static char buf[XDBC_MAX_PACKET];
+ int chunk, ret;
+ int use_cr = 0;
+
+ if (!xdbc.xdbc_reg)
+ return;
+ memset(buf, 0, XDBC_MAX_PACKET);
+ while (n > 0) {
+ for (chunk = 0; chunk < XDBC_MAX_PACKET && n > 0; str++, chunk++, n--) {
+
+ if (!use_cr && *str == '\n') {
+ use_cr = 1;
+ buf[chunk] = '\r';
+ str--;
+ n++;
+ continue;
+ }
+
+ if (use_cr)
+ use_cr = 0;
+ buf[chunk] = *str;
+ }
+
+ if (chunk > 0) {
+ ret = xdbc_bulk_write(buf, chunk);
+ if (ret < 0)
+ xdbc_trace("missed message {%s}\n", buf);
+ }
+ }
+}
+
+static struct console early_xdbc_console = {
+ .name = "earlyxdbc",
+ .write = early_xdbc_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+void __init early_xdbc_register_console(void)
+{
+ if (early_console)
+ return;
+
+ early_console = &early_xdbc_console;
+ if (early_console_keep)
+ early_console->flags &= ~CON_BOOT;
+ else
+ early_console->flags |= CON_BOOT;
+ register_console(early_console);
+}
+
+static void xdbc_unregister_console(void)
+{
+ if (early_xdbc_console.flags & CON_ENABLED)
+ unregister_console(&early_xdbc_console);
+}
+
+static int xdbc_scrub_function(void *ptr)
+{
+ unsigned long flags;
+
+ while (true) {
+ raw_spin_lock_irqsave(&xdbc.lock, flags);
+ xdbc_handle_events();
+
+ if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) {
+ raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+ schedule_timeout_interruptible(1);
+ }
+
+ xdbc_unregister_console();
+ writel(0, &xdbc.xdbc_reg->control);
+ xdbc_trace("dbc scrub function exits\n");
+
+ return 0;
+}
+
+static int __init xdbc_init(void)
+{
+ unsigned long flags;
+ void __iomem *base;
+ int ret = 0;
+ u32 offset;
+
+ if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED))
+ return 0;
+
+ /*
+ * It's time to shut down the DbC, so that the debug
+ * port can be reused by the host controller:
+ */
+ if (early_xdbc_console.index == -1 ||
+ (early_xdbc_console.flags & CON_BOOT)) {
+ xdbc_trace("hardware not used anymore\n");
+ goto free_and_quit;
+ }
+
+ base = ioremap_nocache(xdbc.xhci_start, xdbc.xhci_length);
+ if (!base) {
+ xdbc_trace("failed to remap the io address\n");
+ ret = -ENOMEM;
+ goto free_and_quit;
+ }
+
+ raw_spin_lock_irqsave(&xdbc.lock, flags);
+ early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
+ xdbc.xhci_base = base;
+ offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG);
+ xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset);
+ raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+
+ kthread_run(xdbc_scrub_function, NULL, "%s", "xdbc");
+
+ return 0;
+
+free_and_quit:
+ xdbc_free_ring(&xdbc.evt_ring);
+ xdbc_free_ring(&xdbc.out_ring);
+ xdbc_free_ring(&xdbc.in_ring);
+ free_bootmem(xdbc.table_dma, PAGE_SIZE);
+ free_bootmem(xdbc.out_dma, PAGE_SIZE);
+ writel(0, &xdbc.xdbc_reg->control);
+ early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
+
+ return ret;
+}
+subsys_initcall(xdbc_init);
diff --git a/drivers/usb/early/xhci-dbc.h b/drivers/usb/early/xhci-dbc.h
new file mode 100644
index 000000000000..2df0f6e613fe
--- /dev/null
+++ b/drivers/usb/early/xhci-dbc.h
@@ -0,0 +1,211 @@
+/*
+ * xhci-dbc.h - xHCI debug capability early driver
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_XHCI_DBC_H
+#define __LINUX_XHCI_DBC_H
+
+#include <linux/types.h>
+#include <linux/usb/ch9.h>
+
+/*
+ * xHCI Debug Capability Register interfaces:
+ */
+struct xdbc_regs {
+ __le32 capability;
+ __le32 doorbell;
+ __le32 ersts; /* Event Ring Segment Table Size*/
+ __le32 __reserved_0; /* 0c~0f reserved bits */
+ __le64 erstba; /* Event Ring Segment Table Base Address */
+ __le64 erdp; /* Event Ring Dequeue Pointer */
+ __le32 control;
+ __le32 status;
+ __le32 portsc; /* Port status and control */
+ __le32 __reserved_1; /* 2b~28 reserved bits */
+ __le64 dccp; /* Debug Capability Context Pointer */
+ __le32 devinfo1; /* Device Descriptor Info Register 1 */
+ __le32 devinfo2; /* Device Descriptor Info Register 2 */
+};
+
+#define DEBUG_MAX_BURST(p) (((p) >> 16) & 0xff)
+
+#define CTRL_DBC_RUN BIT(0)
+#define CTRL_PORT_ENABLE BIT(1)
+#define CTRL_HALT_OUT_TR BIT(2)
+#define CTRL_HALT_IN_TR BIT(3)
+#define CTRL_DBC_RUN_CHANGE BIT(4)
+#define CTRL_DBC_ENABLE BIT(31)
+
+#define DCST_DEBUG_PORT(p) (((p) >> 24) & 0xff)
+
+#define PORTSC_CONN_STATUS BIT(0)
+#define PORTSC_CONN_CHANGE BIT(17)
+#define PORTSC_RESET_CHANGE BIT(21)
+#define PORTSC_LINK_CHANGE BIT(22)
+#define PORTSC_CONFIG_CHANGE BIT(23)
+
+/*
+ * xHCI Debug Capability data structures:
+ */
+struct xdbc_trb {
+ __le32 field[4];
+};
+
+struct xdbc_erst_entry {
+ __le64 seg_addr;
+ __le32 seg_size;
+ __le32 __reserved_0;
+};
+
+struct xdbc_info_context {
+ __le64 string0;
+ __le64 manufacturer;
+ __le64 product;
+ __le64 serial;
+ __le32 length;
+ __le32 __reserved_0[7];
+};
+
+struct xdbc_ep_context {
+ __le32 ep_info1;
+ __le32 ep_info2;
+ __le64 deq;
+ __le32 tx_info;
+ __le32 __reserved_0[11];
+};
+
+struct xdbc_context {
+ struct xdbc_info_context info;
+ struct xdbc_ep_context out;
+ struct xdbc_ep_context in;
+};
+
+#define XDBC_INFO_CONTEXT_SIZE 48
+#define XDBC_MAX_STRING_LENGTH 64
+#define XDBC_STRING_MANUFACTURER "Linux"
+#define XDBC_STRING_PRODUCT "Remote GDB"
+#define XDBC_STRING_SERIAL "0001"
+
+struct xdbc_strings {
+ char string0[XDBC_MAX_STRING_LENGTH];
+ char manufacturer[XDBC_MAX_STRING_LENGTH];
+ char product[XDBC_MAX_STRING_LENGTH];
+ char serial[XDBC_MAX_STRING_LENGTH];
+};
+
+#define XDBC_PROTOCOL 1 /* GNU Remote Debug Command Set */
+#define XDBC_VENDOR_ID 0x1d6b /* Linux Foundation 0x1d6b */
+#define XDBC_PRODUCT_ID 0x0004 /* __le16 idProduct; device 0004 */
+#define XDBC_DEVICE_REV 0x0010 /* 0.10 */
+
+/*
+ * xHCI Debug Capability software state structures:
+ */
+struct xdbc_segment {
+ struct xdbc_trb *trbs;
+ dma_addr_t dma;
+};
+
+#define XDBC_TRBS_PER_SEGMENT 256
+
+struct xdbc_ring {
+ struct xdbc_segment *segment;
+ struct xdbc_trb *enqueue;
+ struct xdbc_trb *dequeue;
+ u32 cycle_state;
+};
+
+#define XDBC_EPID_OUT 2
+#define XDBC_EPID_IN 3
+
+struct xdbc_state {
+ u16 vendor;
+ u16 device;
+ u32 bus;
+ u32 dev;
+ u32 func;
+ void __iomem *xhci_base;
+ u64 xhci_start;
+ size_t xhci_length;
+ int port_number;
+
+ /* DbC register base */
+ struct xdbc_regs __iomem *xdbc_reg;
+
+ /* DbC table page */
+ dma_addr_t table_dma;
+ void *table_base;
+
+ /* event ring segment table */
+ dma_addr_t erst_dma;
+ size_t erst_size;
+ void *erst_base;
+
+ /* event ring segments */
+ struct xdbc_ring evt_ring;
+ struct xdbc_segment evt_seg;
+
+ /* debug capability contexts */
+ dma_addr_t dbcc_dma;
+ size_t dbcc_size;
+ void *dbcc_base;
+
+ /* descriptor strings */
+ dma_addr_t string_dma;
+ size_t string_size;
+ void *string_base;
+
+ /* bulk OUT endpoint */
+ struct xdbc_ring out_ring;
+ struct xdbc_segment out_seg;
+ void *out_buf;
+ dma_addr_t out_dma;
+
+ /* bulk IN endpoint */
+ struct xdbc_ring in_ring;
+ struct xdbc_segment in_seg;
+ void *in_buf;
+ dma_addr_t in_dma;
+
+ u32 flags;
+
+ /* spinlock for early_xdbc_write() reentrancy */
+ raw_spinlock_t lock;
+};
+
+#define XDBC_PCI_MAX_BUSES 256
+#define XDBC_PCI_MAX_DEVICES 32
+#define XDBC_PCI_MAX_FUNCTION 8
+
+#define XDBC_TABLE_ENTRY_SIZE 64
+#define XDBC_ERST_ENTRY_NUM 1
+#define XDBC_DBCC_ENTRY_NUM 3
+#define XDBC_STRING_ENTRY_NUM 4
+
+/* Bits definitions for xdbc_state.flags: */
+#define XDBC_FLAGS_INITIALIZED BIT(0)
+#define XDBC_FLAGS_IN_STALL BIT(1)
+#define XDBC_FLAGS_OUT_STALL BIT(2)
+#define XDBC_FLAGS_IN_PROCESS BIT(3)
+#define XDBC_FLAGS_OUT_PROCESS BIT(4)
+#define XDBC_FLAGS_CONFIGURED BIT(5)
+
+#define XDBC_MAX_PACKET 1024
+
+/* Door bell target: */
+#define OUT_EP_DOORBELL 0
+#define IN_EP_DOORBELL 1
+#define DOOR_BELL_TARGET(p) (((p) & 0xff) << 8)
+
+#define xdbc_read64(regs) xhci_read_64(NULL, (regs))
+#define xdbc_write64(val, regs) xhci_write_64(NULL, (val), (regs))
+
+#endif /* __LINUX_XHCI_DBC_H */
diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c
index ca2fa5bbe17e..92f7e5c21162 100644
--- a/drivers/usb/serial/usb_debug.c
+++ b/drivers/usb/serial/usb_debug.c
@@ -32,7 +32,18 @@ static const struct usb_device_id id_table[] = {
{ USB_DEVICE(0x0525, 0x127a) },
{ },
};
-MODULE_DEVICE_TABLE(usb, id_table);
+
+static const struct usb_device_id dbc_id_table[] = {
+ { USB_DEVICE(0x1d6b, 0x0004) },
+ { },
+};
+
+static const struct usb_device_id id_table_combined[] = {
+ { USB_DEVICE(0x0525, 0x127a) },
+ { USB_DEVICE(0x1d6b, 0x0004) },
+ { },
+};
+MODULE_DEVICE_TABLE(usb, id_table_combined);
/* This HW really does not support a serial break, so one will be
* emulated when ever the break state is set to true.
@@ -71,9 +82,20 @@ static struct usb_serial_driver debug_device = {
.process_read_urb = usb_debug_process_read_urb,
};
+static struct usb_serial_driver dbc_device = {
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "xhci_dbc",
+ },
+ .id_table = dbc_id_table,
+ .num_ports = 1,
+ .break_ctl = usb_debug_break_ctl,
+ .process_read_urb = usb_debug_process_read_urb,
+};
+
static struct usb_serial_driver * const serial_drivers[] = {
- &debug_device, NULL
+ &debug_device, &dbc_device, NULL
};
-module_usb_serial_driver(serial_drivers, id_table);
+module_usb_serial_driver(serial_drivers, id_table_combined);
MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c
index 8c4dc1e1f94f..b827a8113e26 100644
--- a/drivers/video/fbdev/efifb.c
+++ b/drivers/video/fbdev/efifb.c
@@ -10,6 +10,7 @@
#include <linux/efi.h>
#include <linux/errno.h>
#include <linux/fb.h>
+#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/screen_info.h>
#include <video/vga.h>
@@ -143,6 +144,8 @@ static struct attribute *efifb_attrs[] = {
};
ATTRIBUTE_GROUPS(efifb);
+static bool pci_dev_disabled; /* FB base matches BAR of a disabled device */
+
static int efifb_probe(struct platform_device *dev)
{
struct fb_info *info;
@@ -152,7 +155,7 @@ static int efifb_probe(struct platform_device *dev)
unsigned int size_total;
char *option = NULL;
- if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI)
+ if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || pci_dev_disabled)
return -ENODEV;
if (fb_get_options("efifb", &option))
@@ -360,3 +363,64 @@ static struct platform_driver efifb_driver = {
};
builtin_platform_driver(efifb_driver);
+
+#if defined(CONFIG_PCI) && !defined(CONFIG_X86)
+
+static bool pci_bar_found; /* did we find a BAR matching the efifb base? */
+
+static void claim_efifb_bar(struct pci_dev *dev, int idx)
+{
+ u16 word;
+
+ pci_bar_found = true;
+
+ pci_read_config_word(dev, PCI_COMMAND, &word);
+ if (!(word & PCI_COMMAND_MEMORY)) {
+ pci_dev_disabled = true;
+ dev_err(&dev->dev,
+ "BAR %d: assigned to efifb but device is disabled!\n",
+ idx);
+ return;
+ }
+
+ if (pci_claim_resource(dev, idx)) {
+ pci_dev_disabled = true;
+ dev_err(&dev->dev,
+ "BAR %d: failed to claim resource for efifb!\n", idx);
+ return;
+ }
+
+ dev_info(&dev->dev, "BAR %d: assigned to efifb\n", idx);
+}
+
+static void efifb_fixup_resources(struct pci_dev *dev)
+{
+ u64 base = screen_info.lfb_base;
+ u64 size = screen_info.lfb_size;
+ int i;
+
+ if (pci_bar_found || screen_info.orig_video_isVGA != VIDEO_TYPE_EFI)
+ return;
+
+ if (screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+ base |= (u64)screen_info.ext_lfb_base << 32;
+
+ if (!base)
+ return;
+
+ for (i = 0; i < PCI_STD_RESOURCE_END; i++) {
+ struct resource *res = &dev->resource[i];
+
+ if (!(res->flags & IORESOURCE_MEM))
+ continue;
+
+ if (res->start <= base && res->end >= base + size - 1) {
+ claim_efifb_bar(dev, i);
+ break;
+ }
+ }
+}
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID, PCI_BASE_CLASS_DISPLAY,
+ 16, efifb_fixup_resources);
+
+#endif
diff --git a/fs/exec.c b/fs/exec.c
index 65145a3df065..72934df68471 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
+ arch_setup_new_exec();
perf_event_exec();
__set_task_comm(current, kbasename(bprm->filename), true);
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 6f96247226a4..5fe9f39cec44 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -5,6 +5,8 @@
#ifdef CONFIG_GENERIC_BUG
#define BUGFLAG_WARNING (1 << 0)
+#define BUGFLAG_ONCE (1 << 1)
+#define BUGFLAG_DONE (1 << 2)
#define BUGFLAG_TAINT(taint) (BUGFLAG_WARNING | ((taint) << 8))
#define BUG_GET_TAINT(bug) ((bug)->flags >> 8)
#endif
@@ -55,6 +57,18 @@ struct bug_entry {
#define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
#endif
+#ifdef __WARN_FLAGS
+#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint))
+#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint))
+
+#define WARN_ON_ONCE(condition) ({ \
+ int __ret_warn_on = !!(condition); \
+ if (unlikely(__ret_warn_on)) \
+ __WARN_ONCE_TAINT(TAINT_WARN); \
+ unlikely(__ret_warn_on); \
+})
+#endif
+
/*
* WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
* significant issues that need prompt attention if they should ever
@@ -97,7 +111,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
#endif
#ifndef WARN
-#define WARN(condition, format...) ({ \
+#define WARN(condition, format...) ({ \
int __ret_warn_on = !!(condition); \
if (unlikely(__ret_warn_on)) \
__WARN_printf(format); \
@@ -112,6 +126,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
unlikely(__ret_warn_on); \
})
+#ifndef WARN_ON_ONCE
#define WARN_ON_ONCE(condition) ({ \
static bool __section(.data.unlikely) __warned; \
int __ret_warn_once = !!(condition); \
@@ -122,6 +137,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
} \
unlikely(__ret_warn_once); \
})
+#endif
#define WARN_ONCE(condition, format...) ({ \
static bool __section(.data.unlikely) __warned; \
diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h
index cc5d9a1405df..41e5b6784b97 100644
--- a/include/asm-generic/mm_hooks.h
+++ b/include/asm-generic/mm_hooks.h
@@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
-
-static inline bool arch_pte_access_permitted(pte_t pte, bool write)
-{
- /* by default, allow everything */
- return true;
-}
#endif /* _ASM_GENERIC_MM_HOOKS_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 1fad160f35de..7dfa767dc680 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte)
}
#endif
+#ifndef pte_access_permitted
+#define pte_access_permitted(pte, write) \
+ (pte_present(pte) && (!(write) || pte_write(pte)))
+#endif
+
+#ifndef pmd_access_permitted
+#define pmd_access_permitted(pmd, write) \
+ (pmd_present(pmd) && (!(write) || pmd_write(pmd)))
+#endif
+
+#ifndef pud_access_permitted
+#define pud_access_permitted(pud, write) \
+ (pud_present(pud) && (!(write) || pud_write(pud)))
+#endif
+
+#ifndef p4d_access_permitted
+#define p4d_access_permitted(p4d, write) \
+ (p4d_present(p4d) && (!(write) || p4d_write(p4d)))
+#endif
+
+#ifndef pgd_access_permitted
+#define pgd_access_permitted(pgd, write) \
+ (pgd_present(pgd) && (!(write) || pgd_write(pgd)))
+#endif
+
#ifndef __HAVE_ARCH_PMD_SAME
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7cdfe167074f..8fba6352cb5c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -287,8 +287,6 @@
*(.rodata1) \
} \
\
- BUG_TABLE \
- \
/* PCI quirks */ \
.pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \
@@ -857,7 +855,8 @@
READ_MOSTLY_DATA(cacheline) \
DATA_DATA \
CONSTRUCTORS \
- }
+ } \
+ BUG_TABLE
#define INIT_TEXT_SECTION(inittext_align) \
. = ALIGN(inittext_align); \
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index e71835bf60a9..c56be7410130 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -423,6 +423,29 @@
#endif
#endif /* atomic_cmpxchg_relaxed */
+#ifndef atomic_try_cmpxchg
+
+#define __atomic_try_cmpxchg(type, _p, _po, _n) \
+({ \
+ typeof(_po) __po = (_po); \
+ typeof(*(_po)) __r, __o = *__po; \
+ __r = atomic_cmpxchg##type((_p), __o, (_n)); \
+ if (unlikely(__r != __o)) \
+ *__po = __r; \
+ likely(__r == __o); \
+})
+
+#define atomic_try_cmpxchg(_p, _po, _n) __atomic_try_cmpxchg(, _p, _po, _n)
+#define atomic_try_cmpxchg_relaxed(_p, _po, _n) __atomic_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic_try_cmpxchg_acquire(_p, _po, _n) __atomic_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic_try_cmpxchg_release(_p, _po, _n) __atomic_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic_try_cmpxchg */
+#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg
+#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg
+#define atomic_try_cmpxchg_release atomic_try_cmpxchg
+#endif /* atomic_try_cmpxchg */
+
/* cmpxchg_relaxed */
#ifndef cmpxchg_relaxed
#define cmpxchg_relaxed cmpxchg
@@ -996,6 +1019,29 @@ static inline int atomic_dec_if_positive(atomic_t *v)
#endif
#endif /* atomic64_cmpxchg_relaxed */
+#ifndef atomic64_try_cmpxchg
+
+#define __atomic64_try_cmpxchg(type, _p, _po, _n) \
+({ \
+ typeof(_po) __po = (_po); \
+ typeof(*(_po)) __r, __o = *__po; \
+ __r = atomic64_cmpxchg##type((_p), __o, (_n)); \
+ if (unlikely(__r != __o)) \
+ *__po = __r; \
+ likely(__r == __o); \
+})
+
+#define atomic64_try_cmpxchg(_p, _po, _n) __atomic64_try_cmpxchg(, _p, _po, _n)
+#define atomic64_try_cmpxchg_relaxed(_p, _po, _n) __atomic64_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic64_try_cmpxchg_acquire(_p, _po, _n) __atomic64_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic64_try_cmpxchg_release(_p, _po, _n) __atomic64_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic64_try_cmpxchg */
+#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg
+#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg
+#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg
+#endif /* atomic64_try_cmpxchg */
+
#ifndef atomic64_andnot
static inline void atomic64_andnot(long long i, atomic64_t *v)
{
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 5828489309bb..687b557fc5eb 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -105,7 +105,7 @@ static inline int is_warning_bug(const struct bug_entry *bug)
return bug->flags & BUGFLAG_WARNING;
}
-const struct bug_entry *find_bug(unsigned long bugaddr);
+struct bug_entry *find_bug(unsigned long bugaddr);
enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 6d7edc3082f9..acc9ce05e5f0 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -182,7 +182,6 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *e
extern void clockevents_register_device(struct clock_event_device *dev);
extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
-extern void clockevents_config(struct clock_event_device *dev, u32 freq);
extern void clockevents_config_and_register(struct clock_event_device *dev,
u32 freq, unsigned long min_delta,
unsigned long max_delta);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 993c87182e02..a4895e5107a0 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -730,6 +730,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
int, const char __user *);
+asmlinkage long compat_sys_arch_prctl(int option, unsigned long arg2);
+
/*
* For most but not all architectures, "am I in a compat syscall?" and
* "am I a compat task?" are the same question. For architectures on which
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 2a5982c37dfb..035c16c9a505 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -201,7 +201,7 @@ struct coresight_ops_sink {
void *sink_config);
unsigned long (*reset_buffer)(struct coresight_device *csdev,
struct perf_output_handle *handle,
- void *sink_config, bool *lost);
+ void *sink_config);
void (*update_buffer)(struct coresight_device *csdev,
struct perf_output_handle *handle,
void *sink_config);
diff --git a/include/linux/efi-bgrt.h b/include/linux/efi-bgrt.h
index 2fd3993c370b..e6f624b53c3d 100644
--- a/include/linux/efi-bgrt.h
+++ b/include/linux/efi-bgrt.h
@@ -6,6 +6,7 @@
#ifdef CONFIG_ACPI_BGRT
void efi_bgrt_init(struct acpi_table_header *table);
+int __init acpi_parse_bgrt(struct acpi_table_header *table);
/* The BGRT data itself; only valid if bgrt_image != NULL. */
extern size_t bgrt_image_size;
@@ -14,6 +15,10 @@ extern struct acpi_table_bgrt bgrt_tab;
#else /* !CONFIG_ACPI_BGRT */
static inline void efi_bgrt_init(struct acpi_table_header *table) {}
+static inline int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+ return 0;
+}
#endif /* !CONFIG_ACPI_BGRT */
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 94d34e0be24f..ec36f42a2add 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1435,9 +1435,6 @@ static inline int efi_runtime_map_copy(void *buf, size_t bufsz)
/* prototypes shared between arch specific and generic stub code */
-#define pr_efi(sys_table, msg) efi_printk(sys_table, "EFI stub: "msg)
-#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg)
-
void efi_printk(efi_system_table_t *sys_table_arg, char *str);
void efi_free(efi_system_table_t *sys_table_arg, unsigned long size,
@@ -1471,7 +1468,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
unsigned long *load_addr,
unsigned long *load_size);
-efi_status_t efi_parse_options(char *cmdline);
+efi_status_t efi_parse_options(char const *cmdline);
efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
struct screen_info *si, efi_guid_t *proto,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 249e579ecd4c..23d58fcd4d9a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -276,8 +276,6 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
return timer->base->cpu_base->hres_active;
}
-extern void hrtimer_peek_ahead_timers(void);
-
/*
* The resolution of the clocks. The resolution value is returned in
* the clock_getres() system call to give application programmers an
@@ -300,8 +298,6 @@ extern unsigned int hrtimer_resolution;
#define hrtimer_resolution (unsigned int)LOW_RES_NSEC
-static inline void hrtimer_peek_ahead_timers(void) { }
-
static inline int hrtimer_is_hres_active(struct hrtimer *timer)
{
return 0;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 926f2f553cc5..7b99759fc324 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -181,6 +181,7 @@ extern struct cred init_cred;
#ifdef CONFIG_RT_MUTEXES
# define INIT_RT_MUTEXES(tsk) \
.pi_waiters = RB_ROOT, \
+ .pi_top_task = NULL, \
.pi_waiters_leftmost = NULL,
#else
# define INIT_RT_MUTEXES(tsk)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index c328e4f7dcad..47e4da5b4fa2 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -267,6 +267,8 @@ extern int arch_init_kprobes(void);
extern void show_registers(struct pt_regs *regs);
extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
+extern bool arch_function_offset_within_entry(unsigned long offset);
+extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
extern bool within_kprobe_blacklist(unsigned long addr);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1e327bb80838..fffe49f188e6 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -361,6 +361,8 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
lock_set_class(lock, lock->name, lock->key, subclass, ip);
}
+extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
+
extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
extern void lockdep_clear_current_reclaim_state(void);
extern void lockdep_trace_alloc(gfp_t mask);
@@ -411,6 +413,7 @@ static inline void lockdep_on(void)
# define lock_acquire(l, s, t, r, c, n, i) do { } while (0)
# define lock_release(l, n, i) do { } while (0)
+# define lock_downgrade(l, i) do { } while (0)
# define lock_set_class(l, n, k, s, i) do { } while (0)
# define lock_set_subclass(l, s, i) do { } while (0)
# define lockdep_set_current_reclaim_state(g) do { } while (0)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00a8fa7e366a..a835edd2db34 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -432,6 +432,10 @@ static inline int pud_devmap(pud_t pud)
{
return 0;
}
+static inline int pgd_devmap(pgd_t pgd)
+{
+ return 0;
+}
#endif
/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f60f45fe226f..45cdb27791a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -367,6 +367,11 @@ struct mm_struct {
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+ /* Base adresses for compatible mmap() */
+ unsigned long mmap_compat_base;
+ unsigned long mmap_compat_legacy_base;
+#endif
unsigned long task_size; /* size of task vm space */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;
diff --git a/include/linux/module.h b/include/linux/module.h
index 0297c5cd7cdf..9ad68561d8c2 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -493,6 +493,7 @@ static inline int module_is_live(struct module *mod)
struct module *__module_text_address(unsigned long addr);
struct module *__module_address(unsigned long addr);
bool is_module_address(unsigned long addr);
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
bool is_module_percpu_address(unsigned long addr);
bool is_module_text_address(unsigned long addr);
@@ -660,6 +661,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
return false;
}
+static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
static inline bool is_module_text_address(unsigned long addr)
{
return false;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 84943e8057ef..316a19f6b635 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page)
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic());
+ VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
@@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
# ifdef CONFIG_PREEMPT_COUNT
- VM_BUG_ON(!in_atomic());
+ VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
VM_BUG_ON_PAGE(page_count(page) == 0, page);
page_ref_add(page, count);
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 56939d3f6e53..491b3f5a5f8a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -110,6 +110,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
#endif
extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
+extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
extern bool is_kernel_percpu_address(unsigned long addr);
#if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 000fdb211c7d..24a635887f28 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -165,6 +165,13 @@ struct hw_perf_event {
struct list_head bp_list;
};
#endif
+ struct { /* amd_iommu */
+ u8 iommu_bank;
+ u8 iommu_cntr;
+ u16 padding;
+ u64 conf;
+ u64 conf1;
+ };
};
/*
* If the event is a per task event, this will point to the task in
@@ -801,6 +808,7 @@ struct perf_output_handle {
struct ring_buffer *rb;
unsigned long wakeup;
unsigned long size;
+ u64 aux_flags;
union {
void *addr;
unsigned long head;
@@ -849,10 +857,11 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
extern void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
- unsigned long size, bool truncated);
+ unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
+extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);
@@ -1112,6 +1121,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
+extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
/* Callchains */
@@ -1267,8 +1277,8 @@ static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event) { return NULL; }
static inline void
-perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
- bool truncated) { }
+perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
+ { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
unsigned long size) { return -EINVAL; }
@@ -1315,6 +1325,7 @@ static inline int perf_unregister_guest_info_callbacks
static inline void perf_event_mmap(struct vm_area_struct *vma) { }
static inline void perf_event_exec(void) { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
+static inline void perf_event_namespaces(struct task_struct *tsk) { }
static inline void perf_event_fork(struct task_struct *tsk) { }
static inline void perf_event_init(void) { }
static inline int perf_swevent_get_recursion_context(void) { return -1; }
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 2aceeafd6fe5..ffb147185e8d 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -1,14 +1,25 @@
#ifndef __RAS_H__
#define __RAS_H__
+#include <asm/errno.h>
+
#ifdef CONFIG_DEBUG_FS
int ras_userspace_consumers(void);
void ras_debugfs_init(void);
int ras_add_daemon_trace(void);
#else
static inline int ras_userspace_consumers(void) { return 0; }
-static inline void ras_debugfs_init(void) { return; }
+static inline void ras_debugfs_init(void) { }
static inline int ras_add_daemon_trace(void) { return 0; }
#endif
+#ifdef CONFIG_RAS_CEC
+void __init cec_init(void);
+int __init parse_cec_param(char *str);
+int cec_add_elem(u64 pfn);
+#else
+static inline void __init cec_init(void) { }
+static inline int cec_add_elem(u64 pfn) { return -ENODEV; }
#endif
+
+#endif /* __RAS_H__ */
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0023fee4bbbc..b34aa649d204 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -6,17 +6,36 @@
#include <linux/spinlock.h>
#include <linux/kernel.h>
+/**
+ * refcount_t - variant of atomic_t specialized for reference counts
+ * @refs: atomic_t counter field
+ *
+ * The counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free bugs.
+ */
typedef struct refcount_struct {
atomic_t refs;
} refcount_t;
#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), }
+/**
+ * refcount_set - set a refcount's value
+ * @r: the refcount
+ * @n: value to which the refcount will be set
+ */
static inline void refcount_set(refcount_t *r, unsigned int n)
{
atomic_set(&r->refs, n);
}
+/**
+ * refcount_read - get a refcount's value
+ * @r: the refcount
+ *
+ * Return: the refcount's value
+ */
static inline unsigned int refcount_read(const refcount_t *r)
{
return atomic_read(&r->refs);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9e082e796963..09bade3737ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -775,6 +775,8 @@ struct task_struct {
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root pi_waiters;
struct rb_node *pi_waiters_leftmost;
+ /* Updated under owner's pi_lock and rq lock */
+ struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter *pi_blocked_on;
#endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 3bd668414f61..f93329aba31a 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,27 +18,20 @@ static inline int rt_task(struct task_struct *p)
}
#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+ return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
- return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
- int newprio)
-{
- return newprio;
-}
-
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 8e0cb7a0f836..68123c1fe549 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
extern void __init setup_nr_cpu_ids(void);
extern void __init smp_init(void);
+extern int __boot_cpu_id;
+
+static inline int get_boot_cpu_id(void)
+{
+ return __boot_cpu_id;
+}
+
#else /* !SMP */
static inline void smp_send_stop(void) { }
@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
static inline void smp_init(void) { }
#endif
+static inline int get_boot_cpu_id(void)
+{
+ return 0;
+}
+
#endif /* !SMP */
/*
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 0dbe41be6181..d7d3ea637dd0 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -113,6 +113,10 @@ static inline void check_object_size(const void *ptr, unsigned long n,
{ }
#endif /* CONFIG_HARDENED_USERCOPY */
+#ifndef arch_setup_new_exec
+static inline void arch_setup_new_exec(void) { }
+#endif
+
#endif /* __KERNEL__ */
#endif /* _LINUX_THREAD_INFO_H */
diff --git a/include/linux/usb/xhci-dbgp.h b/include/linux/usb/xhci-dbgp.h
new file mode 100644
index 000000000000..80c1cca1f529
--- /dev/null
+++ b/include/linux/usb/xhci-dbgp.h
@@ -0,0 +1,29 @@
+/*
+ * Standalone xHCI debug capability driver
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_XHCI_DBGP_H
+#define __LINUX_XHCI_DBGP_H
+
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+int __init early_xdbc_parse_parameter(char *s);
+int __init early_xdbc_setup_hardware(void);
+void __init early_xdbc_register_console(void);
+#else
+static inline int __init early_xdbc_setup_hardware(void)
+{
+ return -ENODEV;
+}
+static inline void __init early_xdbc_register_console(void)
+{
+}
+#endif /* CONFIG_EARLY_PRINTK_USB_XDBC */
+#endif /* __LINUX_XHCI_DBGP_H */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9e3ef6c99e4b..ae1409ffe99a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->prio = p->prio;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
__entry->success = 1; /* rudiment, kill when possible */
__entry->target_cpu = task_cpu(p);
),
@@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch,
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
+ /* XXX SCHED_DEADLINE */
),
TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
@@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task,
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->prio = p->prio;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
__entry->orig_cpu = task_cpu(p);
__entry->dest_cpu = dest_cpu;
),
@@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
- __entry->prio = p->prio;
+ __entry->prio = p->prio; /* XXX SCHED_DEADLINE */
),
TP_printk("comm=%s pid=%d prio=%d",
@@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait,
TP_fast_assign(
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
__entry->pid = pid_nr(pid);
- __entry->prio = current->prio;
+ __entry->prio = current->prio; /* XXX SCHED_DEADLINE */
),
TP_printk("comm=%s pid=%d prio=%d",
@@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime,
*/
TRACE_EVENT(sched_pi_setprio,
- TP_PROTO(struct task_struct *tsk, int newprio),
+ TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task),
- TP_ARGS(tsk, newprio),
+ TP_ARGS(tsk, pi_task),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
@@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio,
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->oldprio = tsk->prio;
- __entry->newprio = newprio;
+ __entry->newprio = pi_task ? pi_task->prio : tsk->prio;
+ /* XXX SCHED_DEADLINE bits missing */
),
TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index bce990f5a35d..31acce9019a6 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud,
(int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval)
);
-TRACE_EVENT(xen_mmu_set_pgd,
- TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval),
- TP_ARGS(pgdp, user_pgdp, pgdval),
+TRACE_EVENT(xen_mmu_set_p4d,
+ TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval),
+ TP_ARGS(p4dp, user_p4dp, p4dval),
TP_STRUCT__entry(
- __field(pgd_t *, pgdp)
- __field(pgd_t *, user_pgdp)
- __field(pgdval_t, pgdval)
- ),
- TP_fast_assign(__entry->pgdp = pgdp;
- __entry->user_pgdp = user_pgdp;
- __entry->pgdval = pgdval.pgd),
- TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)",
- __entry->pgdp, __entry->user_pgdp,
- (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)),
- (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval)
+ __field(p4d_t *, p4dp)
+ __field(p4d_t *, user_p4dp)
+ __field(p4dval_t, p4dval)
+ ),
+ TP_fast_assign(__entry->p4dp = p4dp;
+ __entry->user_p4dp = user_p4dp;
+ __entry->p4dval = p4d_val(p4dval)),
+ TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)",
+ __entry->p4dp, __entry->user_p4dp,
+ (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)),
+ (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval)
);
TRACE_EVENT(xen_mmu_pud_clear,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24ac..d09a9cd021b1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
- __reserved_1 : 36;
+ namespaces : 1, /* include namespaces data */
+ __reserved_1 : 35;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -610,6 +611,23 @@ struct perf_event_header {
__u16 size;
};
+struct perf_ns_link_info {
+ __u64 dev;
+ __u64 ino;
+};
+
+enum {
+ NET_NS_INDEX = 0,
+ UTS_NS_INDEX = 1,
+ IPC_NS_INDEX = 2,
+ PID_NS_INDEX = 3,
+ USER_NS_INDEX = 4,
+ MNT_NS_INDEX = 5,
+ CGROUP_NS_INDEX = 6,
+
+ NR_NAMESPACES, /* number of available namespaces */
+};
+
enum perf_event_type {
/*
@@ -862,6 +880,18 @@ enum perf_event_type {
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,
+ /*
+ * struct {
+ * struct perf_event_header header;
+ * u32 pid;
+ * u32 tid;
+ * u64 nr_namespaces;
+ * { u64 dev, inode; } [nr_namespaces];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_NAMESPACES = 16,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -885,6 +915,7 @@ enum perf_callchain_context {
*/
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */
#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */
#define PERF_FLAG_FD_NO_GROUP (1UL << 0)
#define PERF_FLAG_FD_OUTPUT (1UL << 1)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 37b223e4fc05..9ae6fbe5b5cf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
+int __boot_cpu_id;
+
#endif /* CONFIG_SMP */
/* Boot processor state steps */
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+ __boot_cpu_id = cpu;
+#endif
}
/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff01cba86f43..6e75a5c9412d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,8 @@
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
#include "internal.h"
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_dec(&nr_namespaces_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
+ perf_event_namespaces(task);
}
/*
@@ -6593,6 +6599,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
}
/*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+ struct task_struct *task;
+
+ struct {
+ struct perf_event_header header;
+
+ u32 pid;
+ u32 tid;
+ u64 nr_namespaces;
+ struct perf_ns_link_info link_info[NR_NAMESPACES];
+ } event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+ return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+ void *data)
+{
+ struct perf_namespaces_event *namespaces_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_namespaces_match(event))
+ return;
+
+ perf_event_header__init_id(&namespaces_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ namespaces_event->event_id.header.size);
+ if (ret)
+ return;
+
+ namespaces_event->event_id.pid = perf_event_pid(event,
+ namespaces_event->task);
+ namespaces_event->event_id.tid = perf_event_tid(event,
+ namespaces_event->task);
+
+ perf_output_put(&handle, namespaces_event->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+ struct task_struct *task,
+ const struct proc_ns_operations *ns_ops)
+{
+ struct path ns_path;
+ struct inode *ns_inode;
+ void *error;
+
+ error = ns_get_path(&ns_path, task, ns_ops);
+ if (!error) {
+ ns_inode = ns_path.dentry->d_inode;
+ ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+ ns_link_info->ino = ns_inode->i_ino;
+ }
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+ struct perf_namespaces_event namespaces_event;
+ struct perf_ns_link_info *ns_link_info;
+
+ if (!atomic_read(&nr_namespaces_events))
+ return;
+
+ namespaces_event = (struct perf_namespaces_event){
+ .task = task,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_NAMESPACES,
+ .misc = 0,
+ .size = sizeof(namespaces_event.event_id),
+ },
+ /* .pid */
+ /* .tid */
+ .nr_namespaces = NR_NAMESPACES,
+ /* .link_info[NR_NAMESPACES] */
+ },
+ };
+
+ ns_link_info = namespaces_event.event_id.link_info;
+
+ perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+ task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+ perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+ task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+ perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+ task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+ perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+ task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+ perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+ task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+ perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+ task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+ perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+ task, &cgroupns_operations);
+#endif
+
+ perf_iterate_sb(perf_event_namespaces_output,
+ &namespaces_event,
+ NULL);
+}
+
+/*
* mmap tracking
*/
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
+ if (event->attr.namespaces)
+ atomic_inc(&nr_namespaces_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EACCES;
}
+ if (attr.namespaces) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
+
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b846..2831480c63a2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->paused = 1;
}
+void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
+{
+ /*
+ * OVERWRITE is determined by perf_aux_output_end() and can't
+ * be passed in directly.
+ */
+ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
+ return;
+
+ handle->aux_flags |= flags;
+}
+EXPORT_SYMBOL_GPL(perf_aux_output_flag);
+
/*
* This is called before hardware starts writing to the AUX area to
* obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
handle->event = event;
handle->head = aux_head;
handle->size = 0;
+ handle->aux_flags = 0;
/*
* In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
* of the AUX buffer management code is that after pmu::stop(), the AUX
* transaction must be stopped and therefore drop the AUX reference count.
*/
-void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
- bool truncated)
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
{
+ bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
struct ring_buffer *rb = handle->rb;
- bool wakeup = truncated;
unsigned long aux_head;
- u64 flags = 0;
-
- if (truncated)
- flags |= PERF_AUX_FLAG_TRUNCATED;
/* in overwrite mode, driver provides aux_head via handle */
if (rb->aux_overwrite) {
- flags |= PERF_AUX_FLAG_OVERWRITE;
+ handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
local_set(&rb->aux_head, aux_head);
} else {
+ handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
+
aux_head = local_read(&rb->aux_head);
local_add(size, &rb->aux_head);
}
- if (size || flags) {
+ if (size || handle->aux_flags) {
/*
* Only send RECORD_AUX if we have something useful to communicate
*/
- perf_event_aux_event(handle->event, aux_head, size, flags);
+ perf_event_aux_event(handle->event, aux_head, size,
+ handle->aux_flags);
}
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
}
if (wakeup) {
- if (truncated)
+ if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
handle->event->pending_disable = 1;
perf_output_wakeup(handle);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 3d32513d6c73..65b27c3d3188 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p)
#ifdef CONFIG_RT_MUTEXES
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
+ p->pi_top_task = NULL;
p->pi_blocked_on = NULL;
#endif
}
@@ -2357,6 +2358,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
}
}
+ perf_event_namespaces(current);
+
bad_unshare_cleanup_cred:
if (new_cred)
put_cred(new_cred);
diff --git a/kernel/futex.c b/kernel/futex.c
index 45858ec73941..c3eebcdac206 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,7 +802,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
@@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -980,10 +1020,12 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1033,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1080,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1024,14 +1096,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1040,11 +1112,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/*
@@ -1095,6 +1185,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1119,17 +1212,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
- struct futex_q *match = futex_top_waiter(hb, key);
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
/*
* If there is a waiter on that futex, validate it and
* attach to the pi_state when the validation succeeds.
*/
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1242,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1176,7 +1270,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
/*
@@ -1202,9 +1296,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
*/
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1290,45 +1384,38 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* memory barrier is required here to prevent the following
* store to lock_ptr from getting ahead of the plist_del.
*/
- smp_wmb();
- q->lock_ptr = NULL;
+ smp_store_release(&q->lock_ptr, NULL);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
/*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1337,6 +1424,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1349,10 +1437,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+
+ if (ret)
+ goto out_unlock;
raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
@@ -1365,22 +1452,18 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-
- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-
/*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
+ * We've updated the uservalue, this unlock cannot fail.
*/
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
- rt_mutex_adjust_prio(current);
+ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
- return 0;
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
+
+ return ret;
}
/*
@@ -1826,7 +1909,7 @@ retry_private:
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1909,7 +1992,7 @@ retry_private:
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -2009,20 +2092,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2039,6 +2109,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2125,10 +2213,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2136,7 +2227,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
* previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
+ * waiter but have failed to get the rtmutex the first time.
+ *
* We have to replace the newowner TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
@@ -2144,17 +2236,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2169,47 +2260,60 @@ retry:
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}
pi_state->owner = newowner;
- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
- return 0;
+ if (pi_state->owner != oldowner) {
+ ret = 0;
+ goto out_unlock;
+ }
if (ret)
- return ret;
+ goto out_unlock;
goto retry;
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
static long futex_wait_restart(struct restart_block *restart);
@@ -2231,13 +2335,16 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
int ret = 0;
if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * We can safely read pi_state->owner without holding wait_lock
+ * because we now own the rt_mutex, only the owner will attempt
+ * to change it.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2245,43 +2352,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
- /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
"pi-state %p\n", ret,
q->pi_state->pi_mutex.owner,
q->pi_state->owner);
+ }
out:
return ret ? ret : locked;
@@ -2505,6 +2584,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2557,25 +2638,68 @@ retry_private:
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ rt_mutex_init_waiter(&rt_waiter);
+
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling rt_mutex_start_proxy_lock(). This still fully
+ * serializes against futex_unlock_pi() as that does the exact same
+ * lock handoff sequence.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
@@ -2591,12 +2715,19 @@ retry_private:
* If fixup_owner() faulted and was unable to handle the fault, unlock
* it and return the fault to userspace.
*/
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
goto out_put_key;
out_unlock_put_key:
@@ -2633,7 +2764,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
retry:
@@ -2657,12 +2788,37 @@ retry:
* all and we at least want to know if user space fiddled
* with the futex value instead of blindly unlocking.
*/
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2677,7 +2833,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2685,7 +2840,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2695,8 +2850,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2710,7 +2867,6 @@ out_putkey:
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2814,6 +2970,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
@@ -2840,10 +2997,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2898,8 +3052,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2917,10 +3073,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
spin_lock(q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2938,13 +3097,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4544b115f5eb..dc529116f7e6 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -59,7 +59,7 @@ static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
struct cpumask *
irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
{
- int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec;
+ int n, nodes, cpus_per_vec, extra_vecs, curvec;
int affv = nvecs - affd->pre_vectors - affd->post_vectors;
int last_affv = affv + affd->pre_vectors;
nodemask_t nodemsk = NODE_MASK_NONE;
@@ -94,19 +94,21 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
goto done;
}
- /* Spread the vectors per node */
- vecs_per_node = affv / nodes;
- /* Account for rounding errors */
- extra_vecs = affv - (nodes * vecs_per_node);
-
for_each_node_mask(n, nodemsk) {
- int ncpus, v, vecs_to_assign = vecs_per_node;
+ int ncpus, v, vecs_to_assign, vecs_per_node;
+
+ /* Spread the vectors per node */
+ vecs_per_node = (affv - curvec) / nodes;
/* Get the cpus on this node which are in the mask */
cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n));
/* Calculate the number of cpus per vector */
ncpus = cpumask_weight(nmsk);
+ vecs_to_assign = min(vecs_per_node, ncpus);
+
+ /* Account for rounding errors */
+ extra_vecs = ncpus - vecs_to_assign;
for (v = 0; curvec < last_affv && v < vecs_to_assign;
curvec++, v++) {
@@ -115,14 +117,14 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
- if (!--extra_vecs)
- vecs_per_node++;
+ --extra_vecs;
}
irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
}
if (curvec >= last_affv)
break;
+ --nodes;
}
done:
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index be3c34e4f2ac..686be4b73018 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
- action_ret = action->thread_fn(action->irq, action->dev_id);
+ action_ret = IRQ_NONE;
+ for_each_action_of_desc(desc, action)
+ action_ret |= action->thread_fn(action->irq, action->dev_id);
+
if (!noirqdebug)
note_interrupt(desc, action_ret);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 699c5bc51a92..d733479a10ee 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1391,21 +1391,19 @@ bool within_kprobe_blacklist(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
+ const char *symbol_name, unsigned int offset)
{
- kprobe_opcode_t *addr = p->addr;
-
- if ((p->symbol_name && p->addr) ||
- (!p->symbol_name && !p->addr))
+ if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
- if (p->symbol_name) {
- kprobe_lookup_name(p->symbol_name, addr);
+ if (symbol_name) {
+ kprobe_lookup_name(symbol_name, addr);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+ addr = (kprobe_opcode_t *)(((char *)addr) + offset);
if (addr)
return addr;
@@ -1413,6 +1411,11 @@ invalid:
return ERR_PTR(-EINVAL);
}
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+}
+
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
@@ -1740,11 +1743,12 @@ void unregister_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(unregister_kprobes);
-int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self,
- unsigned long val, void *data)
+int __weak kprobe_exceptions_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return NOTIFY_DONE;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
static struct notifier_block kprobe_exceptions_nb = {
.notifier_call = kprobe_exceptions_notify,
@@ -1875,6 +1879,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+bool __weak arch_function_offset_within_entry(unsigned long offset)
+{
+ return !offset;
+}
+
+bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+
+ if (IS_ERR(kp_addr))
+ return false;
+
+ if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
+ !arch_function_offset_within_entry(offset))
+ return false;
+
+ return true;
+}
+
int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
@@ -1882,6 +1905,9 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
+ if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ return -EINVAL;
+
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 34e97d970761..dc3297895ce3 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ bool is_static = false;
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
@@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
/*
* Static locks do not have their class-keys yet - for them the key
- * is the lock object itself:
+ * is the lock object itself. If the lock is in the per cpu area,
+ * the canonical address of the lock (per cpu offset removed) is
+ * used.
*/
- if (unlikely(!lock->key))
- lock->key = (void *)lock;
+ if (unlikely(!lock->key)) {
+ unsigned long can_addr, addr = (unsigned long)lock;
+
+ if (__is_kernel_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (__is_module_percpu_address(addr, &can_addr))
+ lock->key = (void *)can_addr;
+ else if (static_obj(lock))
+ lock->key = (void *)lock;
+ else
+ return ERR_PTR(-EINVAL);
+ is_static = true;
+ }
/*
* NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
}
}
- return NULL;
+ return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
}
/*
@@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
class = look_up_lock_class(lock, subclass);
- if (likely(class))
+ if (likely(!IS_ERR_OR_NULL(class)))
goto out_set_class_cache;
/*
* Debug-check: all keys must be persistent!
- */
- if (!static_obj(lock->key)) {
+ */
+ if (IS_ERR(class)) {
debug_locks_off();
printk("INFO: trying to register non-static key.\n");
printk("the code is fine but needs lockdep annotation.\n");
printk("turning off the locking correctness validator.\n");
dump_stack();
-
return NULL;
}
@@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
* Clearly if the lock hasn't been acquired _ever_, we're not
* holding it either, so report failure.
*/
- if (!class)
+ if (IS_ERR_OR_NULL(class))
return 0;
/*
@@ -3437,13 +3450,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
return 0;
}
+/* @depth must not be zero */
+static struct held_lock *find_held_lock(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned int depth, int *idx)
+{
+ struct held_lock *ret, *hlock, *prev_hlock;
+ int i;
+
+ i = depth - 1;
+ hlock = curr->held_locks + i;
+ ret = hlock;
+ if (match_held_lock(hlock, lock))
+ goto out;
+
+ ret = NULL;
+ for (i--, prev_hlock = hlock--;
+ i >= 0;
+ i--, prev_hlock = hlock--) {
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ ret = NULL;
+ break;
+ }
+ if (match_held_lock(hlock, lock)) {
+ ret = hlock;
+ break;
+ }
+ }
+
+out:
+ *idx = i;
+ return ret;
+}
+
+static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
+ int idx)
+{
+ struct held_lock *hlock;
+
+ for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
+ if (!__lock_acquire(hlock->instance,
+ hlock_class(hlock)->subclass,
+ hlock->trylock,
+ hlock->read, hlock->check,
+ hlock->hardirqs_off,
+ hlock->nest_lock, hlock->acquire_ip,
+ hlock->references, hlock->pin_count))
+ return 1;
+ }
+ return 0;
+}
+
static int
__lock_set_class(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, unsigned int subclass,
unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class *class;
unsigned int depth;
int i;
@@ -3456,21 +3523,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
if (DEBUG_LOCKS_WARN_ON(!depth))
return 0;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
- }
- return print_unlock_imbalance_bug(curr, lock, ip);
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes + 1;
@@ -3478,15 +3534,46 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
- return 0;
- }
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
+
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ return 0;
+ return 1;
+}
+
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned int depth;
+ int i;
+
+ depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
+
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ WARN(hlock->read, "downgrading a read lock");
+ hlock->read = 1;
+ hlock->acquire_ip = ip;
+
+ if (reacquire_held_locks(curr, depth, i))
+ return 0;
/*
* I took it apart and put it back together again, except now I have
@@ -3508,7 +3595,7 @@ static int
__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
unsigned int depth;
int i;
@@ -3527,21 +3614,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* Check whether the lock exists in the current stack
* of held locks:
*/
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
- }
- return print_unlock_imbalance_bug(curr, lock, ip);
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock)
+ return print_unlock_imbalance_bug(curr, lock, ip);
-found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -3568,15 +3644,8 @@ found_it:
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- for (i++; i < depth; i++) {
- hlock = curr->held_locks + i;
- if (!__lock_acquire(hlock->instance,
- hlock_class(hlock)->subclass, hlock->trylock,
- hlock->read, hlock->check, hlock->hardirqs_off,
- hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
- return 0;
- }
+ if (reacquire_held_locks(curr, depth, i + 1))
+ return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
@@ -3741,6 +3810,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lock_set_class);
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ current->lockdep_recursion = 1;
+ check_flags(flags);
+ if (__lock_downgrade(lock, ip))
+ check_chain_key(current);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -3905,7 +3991,7 @@ static void
__lock_contended(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
int i, contention_point, contending_point;
@@ -3918,22 +4004,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, ip);
+ return;
}
- print_lock_contention_bug(curr, lock, ip);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -3957,7 +4033,7 @@ static void
__lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
- struct held_lock *hlock, *prev_hlock;
+ struct held_lock *hlock;
struct lock_class_stats *stats;
unsigned int depth;
u64 now, waittime = 0;
@@ -3971,22 +4047,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
- prev_hlock = NULL;
- for (i = depth-1; i >= 0; i--) {
- hlock = curr->held_locks + i;
- /*
- * We must not cross into another context:
- */
- if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
- break;
- if (match_held_lock(hlock, lock))
- goto found_it;
- prev_hlock = hlock;
+ hlock = find_held_lock(curr, lock, depth, &i);
+ if (!hlock) {
+ print_lock_contention_bug(curr, lock, _RET_IP_);
+ return;
}
- print_lock_contention_bug(curr, lock, _RET_IP_);
- return;
-found_it:
if (hlock->instance != lock)
return;
@@ -4174,7 +4240,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
* If the class exists we look it up and zap it:
*/
class = look_up_lock_class(lock, j);
- if (class)
+ if (!IS_ERR_OR_NULL(class))
zap_class(class);
}
/*
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 97ee9df32e0f..32fe775a2eaf 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -174,12 +174,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
lock->name = name;
}
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b6..b585af9a1b50 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6edc32ecd9c5..0e641eb473de 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
}
#endif
+/*
+ * Only use with rt_mutex_waiter_{less,equal}()
+ */
+#define task_to_waiter(p) \
+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline }
+
static inline int
rt_mutex_waiter_less(struct rt_mutex_waiter *left,
struct rt_mutex_waiter *right)
@@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return dl_time_before(left->task->dl.deadline,
- right->task->dl.deadline);
+ return dl_time_before(left->deadline, right->deadline);
return 0;
}
+static inline int
+rt_mutex_waiter_equal(struct rt_mutex_waiter *left,
+ struct rt_mutex_waiter *right)
+{
+ if (left->prio != right->prio)
+ return 0;
+
+ /*
+ * If both waiters have dl_prio(), we check the deadlines of the
+ * associated tasks.
+ * If left waiter has a dl_prio(), and we didn't return 0 above,
+ * then right waiter has a dl_prio() too.
+ */
+ if (dl_prio(left->prio))
+ return left->deadline == right->deadline;
+
+ return 1;
+}
+
static void
rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter)
{
@@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
RB_CLEAR_NODE(&waiter->pi_tree_entry);
}
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
- if (likely(!task_has_pi_waiters(task)))
- return task->normal_prio;
-
- return min(task_top_pi_waiter(task)->prio,
- task->normal_prio);
-}
-
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
+static void rt_mutex_adjust_prio(struct task_struct *p)
{
- if (likely(!task_has_pi_waiters(task)))
- return NULL;
-
- return task_top_pi_waiter(task)->task;
-}
+ struct task_struct *pi_task = NULL;
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
-{
- if (!task_has_pi_waiters(task))
- return newprio;
+ lockdep_assert_held(&p->pi_lock);
- if (task_top_pi_waiter(task)->task->prio <= newprio)
- return task_top_pi_waiter(task)->task->prio;
- return newprio;
-}
+ if (task_has_pi_waiters(p))
+ pi_task = task_top_pi_waiter(p)->task;
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
- int prio = rt_mutex_getprio(task);
-
- if (task->prio != prio || dl_prio(prio))
- rt_mutex_setprio(task, prio);
-}
-
-/*
- * Adjust task priority (undo boosting). Called from the exit path of
- * rt_mutex_slowunlock() and rt_mutex_slowlock().
- *
- * (Note: We do this outside of the protection of lock->wait_lock to
- * allow the lock to be taken while or before we readjust the priority
- * of task. We do not use the spin_xx_mutex() variants here as we are
- * outside of the debug path.)
- */
-void rt_mutex_adjust_prio(struct task_struct *task)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ rt_mutex_setprio(p, pi_task);
}
/*
@@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* enabled we continue, but stop the requeueing in the chain
* walk.
*/
- if (waiter->prio == task->prio) {
+ if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
if (!detect_deadlock)
goto out_unlock_pi;
else
@@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* [7] Requeue the waiter in the lock waiter tree. */
rt_mutex_dequeue(lock, waiter);
+
+ /*
+ * Update the waiter prio fields now that we're dequeued.
+ *
+ * These values can have changed through either:
+ *
+ * sys_sched_set_scheduler() / sys_sched_setattr()
+ *
+ * or
+ *
+ * DL CBS enforcement advancing the effective deadline.
+ *
+ * Even though pi_waiters also uses these fields, and that tree is only
+ * updated in [11], we can do this here, since we hold [L], which
+ * serializes all pi_waiters access and rb_erase() does not care about
+ * the values of the node being removed.
+ */
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
+
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
@@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else if (prerequeue_top_waiter == waiter) {
/*
@@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
} else {
/*
* Nothing changed. No need to do any priority
@@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* the top waiter priority (kernel view),
* @task lost.
*/
- if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ if (!rt_mutex_waiter_less(task_to_waiter(task),
+ rt_mutex_top_waiter(lock)))
return 0;
/*
@@ -938,8 +928,6 @@ takeit:
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ lockdep_assert_held(&lock->wait_lock);
+
/*
* Early deadlock detection. We really don't want the task to
* enqueue on itself just to untangle the mess later. It's not
@@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- __rt_mutex_adjust_prio(task);
+ rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
+ waiter->deadline = task->dl.deadline;
/* Get the top priority waiter on the lock */
if (rt_mutex_has_waiters(lock))
@@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
waiter = rt_mutex_top_waiter(lock);
/*
- * Remove it from current->pi_waiters. We do not adjust a
- * possible priority boost right now. We execute wakeup in the
- * boosted mode and go back to normal after releasing
- * lock->wait_lock.
+ * Remove it from current->pi_waiters and deboost.
+ *
+ * We must in fact deboost here in order to ensure we call
+ * rt_mutex_setprio() to update p->pi_top_task before the
+ * task unblocks.
*/
rt_mutex_dequeue_pi(current, waiter);
+ rt_mutex_adjust_prio(current);
/*
* As we are waking up the top waiter, and the waiter stays
@@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock(&current->pi_lock);
-
+ /*
+ * We deboosted before waking the top waiter task such that we don't
+ * run two tasks with the 'same' priority (and ensure the
+ * p->pi_top_task pointer points to a blocked task). This however can
+ * lead to priority inversion if we would get preempted after the
+ * deboost but before waking our donor task, hence the preempt_disable()
+ * before unlock.
+ *
+ * Pairs with preempt_enable() in rt_mutex_postunlock();
+ */
+ preempt_disable();
wake_q_add(wake_q, waiter->task);
+ raw_spin_unlock(&current->pi_lock);
}
/*
@@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock,
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
+ lockdep_assert_held(&lock->wait_lock);
+
raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
@@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- __rt_mutex_adjust_prio(owner);
+ rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
@@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
- if (!waiter || (waiter->prio == task->prio &&
- !dl_prio(task->prio))) {
+ if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
@@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task)
next_lock, NULL, task);
}
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
@@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
unsigned long flags;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter);
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
/*
* Slow path to release a rt-mutex.
- * Return whether the current task needs to undo a potential priority boosting.
+ *
+ * Return whether the current task needs to call rt_mutex_postunlock().
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
@@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
debug_rt_mutex_unlock(lock);
- rt_mutex_deadlock_account_unlock(current);
-
/*
* We must be careful here if the fast path is enabled. If we
* have no waiters queued we cannot set owner to NULL here
@@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
* Queue the next waiter for wakeup once we release the wait_lock.
*/
mark_wakeup_next_waiter(wake_q, lock);
-
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* check PI boosting */
- return true;
+ return true; /* call rt_mutex_postunlock() */
}
/*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
+
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 1;
- }
+
return slowfn(lock);
}
+/*
+ * Performs the wakeup of the the top-waiter and re-enables preemption.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
+{
+ wake_up_q(wake_q);
+
+ /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
+ preempt_enable();
+}
+
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
DEFINE_WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
-
- } else {
- bool deboost = slowfn(lock, &wake_q);
-
- wake_up_q(&wake_q);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ if (slowfn(lock, &wake_q))
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
{
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
+ return rt_mutex_slowtrylock(lock);
}
/**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
*/
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return false;
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
}
- return rt_mutex_slowunlock(lock, wqh);
+
+ mark_wakeup_next_waiter(wake_q, lock);
+ /*
+ * We've already deboosted, retain preempt_disabled when dropping
+ * the wait_lock to avoid inversion until the wakeup. Matched
+ * by rt_mutex_postunlock();
+ */
+ preempt_disable();
+
+ return true; /* call postunlock() */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ DEFINE_WAKE_Q(wake_q);
+ bool postunlock;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ postunlock = __rt_mutex_futex_unlock(lock, &wake_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (postunlock)
+ rt_mutex_postunlock(&wake_q);
}
/**
@@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
/**
@@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
{
debug_rt_mutex_proxy_unlock(lock);
rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
}
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
-
- if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock_irq(&lock->wait_lock);
+ if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
- }
/* We enforce deadlock detection for futexes */
ret = task_blocks_on_rt_mutex(lock, waiter, task,
@@ -1703,14 +1710,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
}
/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
* rt_mutex_next_owner - return the next owner of the lock
*
* @lock: the rt lock query
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
}
/**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
*
* Returns:
* 0 - success
* <0 - error, one of -EINTR, -ETIMEDOUT
*
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
*/
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter)
{
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
/* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- if (unlikely(ret))
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock_irq(&lock->wait_lock);
- return ret;
+ return cleanup;
}
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c407..6607802efa8b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
*/
#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
#define debug_rt_mutex_init_waiter(w) do { } while (0)
#define debug_rt_mutex_free_waiter(w) do { } while (0)
#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 856dfff5c33a..72ad45a9a794 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -34,6 +34,7 @@ struct rt_mutex_waiter {
struct rt_mutex *deadlock_lock;
#endif
int prio;
+ u64 deadline;
};
/*
@@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
-extern void rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+
+extern void rt_mutex_postunlock(struct wake_q_head *wake_q);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 90a74ccd85a4..4d48b1c4870d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
*/
void downgrade_write(struct rw_semaphore *sem)
{
- /*
- * lockdep: a downgraded write will live on as a write
- * dependency.
- */
+ lock_downgrade(&sem->dep_map, _RET_IP_);
+
rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 6b7abb334ca6..39f56c870051 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus)
struct stress {
struct work_struct work;
struct ww_mutex *locks;
+ unsigned long timeout;
int nlocks;
- int nloops;
};
static int *get_random_order(int count)
@@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work)
if (!order)
return;
- ww_acquire_init(&ctx, &ww_class);
-
do {
int contended = -1;
int n, err;
+ ww_acquire_init(&ctx, &ww_class);
retry:
err = 0;
for (n = 0; n < nlocks; n++) {
@@ -433,9 +432,9 @@ retry:
__func__, err);
break;
}
- } while (--stress->nloops);
- ww_acquire_fini(&ctx);
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
kfree(order);
kfree(stress);
@@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work)
kfree(order);
order = NULL;
- ww_acquire_init(&ctx, &ww_class);
-
do {
+ ww_acquire_init(&ctx, &ww_class);
+
list_for_each_entry(ll, &locks, link) {
err = ww_mutex_lock(ll->lock, &ctx);
if (!err)
@@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work)
dummy_load(stress);
list_for_each_entry(ll, &locks, link)
ww_mutex_unlock(ll->lock);
- } while (--stress->nloops);
- ww_acquire_fini(&ctx);
+ ww_acquire_fini(&ctx);
+ } while (!time_after(jiffies, stress->timeout));
out:
list_for_each_entry_safe(ll, ln, &locks, link)
@@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work)
__func__, err);
break;
}
- } while (--stress->nloops);
+ } while (!time_after(jiffies, stress->timeout));
kfree(stress);
}
@@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work)
#define STRESS_ONE BIT(2)
#define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE)
-static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
+static int stress(int nlocks, int nthreads, unsigned int flags)
{
struct ww_mutex *locks;
int n;
@@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags)
INIT_WORK(&stress->work, fn);
stress->locks = locks;
stress->nlocks = nlocks;
- stress->nloops = nloops;
+ stress->timeout = jiffies + 2*HZ;
queue_work(wq, &stress->work);
nthreads--;
@@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER);
+ ret = stress(16, 2*ncpus, STRESS_INORDER);
if (ret)
return ret;
- ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER);
+ ret = stress(16, 2*ncpus, STRESS_REORDER);
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL);
+ ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/module.c b/kernel/module.c
index 23224d8ba00d..63321952c71c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod,
memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
}
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
{
struct module *mod;
unsigned int cpu;
@@ -688,9 +679,15 @@ bool is_module_percpu_address(unsigned long addr)
continue;
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(mod->percpu, cpu);
-
- if ((void *)addr >= start &&
- (void *)addr < start + mod->percpu_size) {
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
preempt_enable();
return true;
}
@@ -701,6 +698,20 @@ bool is_module_percpu_address(unsigned long addr)
return false;
}
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+ return __is_module_percpu_address(addr, NULL);
+}
+
#else /* ... !CONFIG_SMP */
static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +743,11 @@ bool is_module_percpu_address(unsigned long addr)
return false;
}
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed..f6c5d330059a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/syscalls.h>
#include <linux/cgroup.h>
+#include <linux/perf_event.h>
static struct kmem_cache *nsproxy_cachep;
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
goto out;
}
switch_task_namespaces(tsk, new_nsproxy);
+
+ perf_event_namespaces(tsk);
out:
fput(file);
return err;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..0c443bbdaaca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
cpumask_var_t cpu_isolated_map;
/*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- local_irq_disable();
- rq = this_rq();
- raw_spin_lock(&rq->lock);
-
- return rq;
-}
-
-/*
* __task_rq_lock - lock the rq @p resides on.
*/
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
return;
#ifdef CONFIG_SCHED_DEBUG
+ if (sched_feat(WARN_DOUBLE_CLOCK))
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
+
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+ struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
__hrtick_restart(rq);
rq->hrtick_csd_pending = 0;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
- update_rq_clock(rq);
+ if (!(flags & ENQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
if (!(flags & ENQUEUE_RESTORE))
sched_info_queued(rq, p);
+
p->sched_class->enqueue_task(rq, p, flags);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_NOCLOCK))
+ update_rq_clock(rq);
+
if (!(flags & DEQUEUE_SAVE))
sched_info_dequeued(rq, p);
+
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
*
* Returns (locked) new rq. Old rq's lock is released.
*/
-static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int new_cpu)
{
lockdep_assert_held(&rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING;
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
*/
-static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+ struct task_struct *p, int dest_cpu)
{
if (unlikely(!cpu_active(dest_cpu)))
return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
return rq;
- rq = move_queued_task(rq, p, dest_cpu);
+ update_rq_clock(rq);
+ rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
}
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
+ struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
sched_ttwu_pending();
raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
*/
if (task_rq(p) == rq) {
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
p->wake_cpu = arg->dest_cpu;
}
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
p->sched_class->set_cpus_allowed(p, new_mask);
if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
}
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- rq_unpin_lock(rq, &rf);
- rq = move_queued_task(rq, p, dest_cpu);
- rq_repin_lock(rq, &rf);
+ rq = move_queued_task(rq, &rf, p, dest_cpu);
}
out:
task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
+ struct rq_flags srf, drf;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
+ rq_pin_lock(src_rq, &srf);
+ rq_pin_lock(dst_rq, &drf);
+
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
+
+ rq_unpin_lock(dst_rq, &drf);
+ rq_unpin_lock(src_rq, &srf);
+
} else {
/*
* Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
- int en_flags = ENQUEUE_WAKEUP;
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_held(&rq->lock);
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
- unsigned long flags;
struct rq_flags rf;
if (!llist)
return;
- raw_spin_lock_irqsave(&rq->lock, flags);
- rq_pin_lock(rq, &rf);
+ rq_lock_irqsave(rq, &rf);
+ update_rq_clock(rq);
while (llist) {
int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, wake_flags, &rf);
}
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
rcu_read_lock();
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
} else {
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
/* Else CPU is not idle, do nothing here: */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
}
#endif
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, &rf);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf);
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- rq_unpin_lock(rq, rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- rq_repin_lock(rq, rf);
+ rq_relock(rq, rf);
}
if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
delayacct_blkio_end();
atomic_dec(&rq->nr_iowait);
}
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
}
ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
+ struct rq_flags rf;
sched_clock_tick();
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
+
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
cpu_load_update_active(rq);
calc_global_load_tick(rq);
- raw_spin_unlock(&rq->lock);
+
+ rq_unlock(rq, &rf);
perf_event_task_tick();
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock(&rq->lock);
- rq_pin_lock(rq, &rf);
+ rq_lock(rq, &rf);
/* Promote REQ to ACT */
rq->clock_update_flags <<= 1;
+ update_rq_clock(rq);
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
prev->on_rq = 0;
if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nvcsw;
}
- if (task_on_rq_queued(prev))
- update_rq_clock(rq);
-
next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
rq = context_switch(rq, prev, next, &rf);
} else {
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
balance_callback(rq);
@@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function);
#ifdef CONFIG_RT_MUTEXES
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
/*
* rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
*
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
@@ -3682,17 +3697,42 @@ EXPORT_SYMBOL(default_wake_function);
* Used by the rt_mutex code to implement priority inheritance
* logic. Call site only calls if the priority of the task changed.
*/
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int prio, oldprio, queued, running, queue_flag =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class;
struct rq_flags rf;
struct rq *rq;
- BUG_ON(prio > MAX_PRIO);
+ /* XXX used to be waiter->prio, not waiter->task->prio */
+ prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+ /*
+ * If nothing changed; bail early.
+ */
+ if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+ return;
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
+ /*
+ * Set under pi_lock && rq->lock, such that the value can be used under
+ * either lock.
+ *
+ * Note that there is loads of tricky to make this pointer cache work
+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+ * ensure a task is de-boosted (pi_task is set to NULL) before the
+ * task is allowed to run again (and can exit). This ensures the pointer
+ * points to a blocked task -- which guaratees the task is present.
+ */
+ p->pi_top_task = pi_task;
+
+ /*
+ * For FIFO/RR we only need to set prio, if that matches we're done.
+ */
+ if (prio == p->prio && !dl_prio(prio))
+ goto out_unlock;
/*
* Idle task boosting is a nono in general. There is one
@@ -3712,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
goto out_unlock;
}
- trace_sched_pi_setprio(p, prio);
+ trace_sched_pi_setprio(p, pi_task);
oldprio = p->prio;
if (oldprio == prio)
@@ -3736,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
@@ -3774,6 +3813,11 @@ out_unlock:
balance_callback(rq);
preempt_enable();
}
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
#endif
void set_user_nice(struct task_struct *p, long nice)
@@ -3805,7 +3849,7 @@ void set_user_nice(struct task_struct *p, long nice)
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
put_prev_task(rq, p);
@@ -3816,7 +3860,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4020,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* Keep a potential priority boosting if called from
* sched_setscheduler().
*/
+ p->prio = normal_prio(p);
if (keep_boost)
- p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
- else
- p->prio = normal_prio(p);
+ p->prio = rt_effective_prio(p, p->prio);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -4126,7 +4169,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq_flags rf;
int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
/* May grab non-irq protected spin_locks: */
@@ -4310,7 +4353,7 @@ change:
* the runqueue. This will be done when the task deboost
* itself.
*/
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
@@ -4923,7 +4966,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
*/
SYSCALL_DEFINE0(sched_yield)
{
- struct rq *rq = this_rq_lock();
+ struct rq_flags rf;
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ rq_lock(rq, &rf);
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
@@ -4932,9 +4980,8 @@ SYSCALL_DEFINE0(sched_yield)
* Since we are going to call schedule() anyway, there's
* no need to preempt or enable interrupts:
*/
- __release(rq->lock);
- spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- do_raw_spin_unlock(&rq->lock);
+ preempt_disable();
+ rq_unlock(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
@@ -5514,7 +5561,7 @@ void sched_setnuma(struct task_struct *p, int nid)
p->numa_preferred_nid = nid;
if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5626,11 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
- struct rq_flags rf;
+ struct rq_flags orf = *rf;
int dest_cpu;
/*
@@ -5602,9 +5649,7 @@ static void migrate_tasks(struct rq *dead_rq)
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
- rq_pin_lock(rq, &rf);
update_rq_clock(rq);
- rq_unpin_lock(rq, &rf);
for (;;) {
/*
@@ -5617,8 +5662,7 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task() assumes pinned rq->lock:
*/
- rq_repin_lock(rq, &rf);
- next = pick_next_task(rq, &fake_task, &rf);
+ next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5631,10 +5675,9 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- rq_unpin_lock(rq, &rf);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
raw_spin_lock(&next->pi_lock);
- raw_spin_lock(&rq->lock);
+ rq_relock(rq, rf);
/*
* Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5691,12 @@ static void migrate_tasks(struct rq *dead_rq)
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-
- rq = __migrate_task(rq, next, dest_cpu);
+ rq = __migrate_task(rq, rf, next, dest_cpu);
if (rq != dead_rq) {
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, rf);
rq = dead_rq;
- raw_spin_lock(&rq->lock);
+ *rf = orf;
+ rq_relock(rq, rf);
}
raw_spin_unlock(&next->pi_lock);
}
@@ -5766,7 +5809,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
int sched_cpu_activate(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
set_cpu_active(cpu, true);
@@ -5784,12 +5827,12 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
update_max_interval();
@@ -5847,18 +5890,20 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ struct rq_flags rf;
/* Handle pending wakeups and then migrate everything off */
sched_ttwu_pending();
- raw_spin_lock_irqsave(&rq->lock, flags);
+
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(rq);
+ migrate_tasks(rq, &rf);
BUG_ON(rq->nr_running != 1);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
+
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(cpu);
@@ -6412,7 +6457,8 @@ static void sched_change_group(struct task_struct *tsk, int type)
*/
void sched_move_task(struct task_struct *tsk)
{
- int queued, running;
+ int queued, running, queue_flags =
+ DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq_flags rf;
struct rq *rq;
@@ -6423,14 +6469,14 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ dequeue_task(rq, tsk, queue_flags);
if (running)
put_prev_task(rq, tsk);
sched_change_group(tsk, TASK_MOVE_GROUP);
if (queued)
- enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+ enqueue_task(rq, tsk, queue_flags);
if (running)
set_curr_task(rq, tsk);
@@ -7008,14 +7054,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
+ struct rq_flags rf;
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
if (runtime_was_enabled && !runtime_enabled)
cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3ae75f5a69fc..359dbc05a3b4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
@@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u64 val, u64 n)
return val;
}
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
{
- u32 contrib = 0;
+ u32 c1, c2, c3 = remainder; /* y^0 == 1 */
+
+ if (!periods)
+ return remainder - period_contrib;
- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
+ if (unlikely(periods >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
+ /*
+ * c1 = d1 y^(p+1)
+ */
+ c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+ periods -= 1;
+ /*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be:
+ *
+ * c2 = 1024 \Sum y^n
+ *
+ * We can compute this reasonably efficiently by combining:
+ *
+ * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+ */
+ if (likely(periods <= LOAD_AVG_PERIOD)) {
+ c2 = runnable_avg_yN_sum[periods];
+ } else {
+ c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+ periods %= LOAD_AVG_PERIOD;
+ c2 = decay_load(c2, periods);
+ c2 += runnable_avg_yN_sum[periods];
+ }
+
+ return c1 + c2 + c3;
}
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ * d1 d2 d3
+ * ^ ^ ^
+ * | | |
+ * |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ * p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ * n=1
+ *
+ * = u y^(p+1) + (Step 1)
+ *
+ * p
+ * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2)
+ * n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+ unsigned long scale_freq, scale_cpu;
+ u64 periods;
+ u32 contrib;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+ /*
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+ }
+
+ /*
+ * Step 2
+ */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
+/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
@@ -2849,13 +2930,10 @@ static u32 __compute_runnable_contrib(u64 n)
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;
delta = now - sa->last_update_time;
/*
@@ -2876,81 +2954,49 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return 0;
sa->last_update_time = now;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
-
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
-
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+ return 0;
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
+ return 1;
+}
- sa->period_contrib += delta;
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+ return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
- }
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ return ___update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+}
- return decayed;
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+ return ___update_load_avg(now, cpu, &cfs_rq->avg,
+ scale_load_down(cfs_rq->load.weight),
+ cfs_rq->curr != NULL, cfs_rq);
}
/*
@@ -3014,6 +3060,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next)
{
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
if (!sched_feat(ATTACH_AGE_LOAD))
return;
@@ -3024,11 +3073,11 @@ void set_task_rq_fair(struct sched_entity *se,
* time. This will result in the wakee task is less decayed, but giving
* the wakee more load sounds not bad.
*/
- if (se->avg.last_update_time && prev) {
- u64 p_last_update_time;
- u64 n_last_update_time;
+ if (!(se->avg.last_update_time && prev))
+ return;
#ifndef CONFIG_64BIT
+ {
u64 p_last_update_time_copy;
u64 n_last_update_time_copy;
@@ -3043,14 +3092,13 @@ void set_task_rq_fair(struct sched_entity *se,
} while (p_last_update_time != p_last_update_time_copy ||
n_last_update_time != n_last_update_time_copy);
+ }
#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
#endif
- __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
- &se->avg, 0, 0, NULL);
- se->avg.last_update_time = n_last_update_time;
- }
+ __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
+ se->avg.last_update_time = n_last_update_time;
}
/* Take into account change of utilization of a child task group */
@@ -3173,6 +3221,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 1;
}
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If sched_entity still have not zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
+
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate_avg)
+ return false;
+
+ /*
+ * Otherwise, the load and the utilization of the sched_entity is
+ * already zero and there is no pending propagation, so it will be a
+ * waste of time to try to decay it:
+ */
+ return true;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3343,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
set_tg_cfs_propagate(cfs_rq);
}
- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
+ decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -3298,11 +3375,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+ __update_load_avg_se(now, cpu, cfs_rq, se);
decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3481,7 @@ void sync_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
last_update_time = cfs_rq_last_update_time(cfs_rq);
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
}
/*
@@ -4271,8 +4345,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
struct rq *rq = rq_of(cfs_rq);
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
@@ -4289,7 +4364,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
unthrottle_cfs_rq(cfs_rq);
next:
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
if (!remaining)
break;
@@ -5097,15 +5172,16 @@ void cpu_load_update_nohz_stop(void)
unsigned long curr_jiffies = READ_ONCE(jiffies);
struct rq *this_rq = this_rq();
unsigned long load;
+ struct rq_flags rf;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
load = weighted_cpuload(cpu_of(this_rq));
- raw_spin_lock(&this_rq->lock);
+ rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
- raw_spin_unlock(&this_rq->lock);
+ rq_unlock(this_rq, &rf);
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6845,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
lockdep_assert_held(&env->src_rq->lock);
p->on_rq = TASK_ON_RQ_MIGRATING;
- deactivate_task(env->src_rq, p, 0);
+ deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -6902,7 +6978,7 @@ static void attach_task(struct rq *rq, struct task_struct *p)
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_NOCLOCK);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(rq, p, 0);
}
@@ -6913,9 +6989,12 @@ static void attach_task(struct rq *rq, struct task_struct *p)
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
- raw_spin_lock(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
attach_task(rq, p);
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -6926,8 +7005,10 @@ static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
+ struct rq_flags rf;
- raw_spin_lock(&env->dst_rq->lock);
+ rq_lock(env->dst_rq, &rf);
+ update_rq_clock(env->dst_rq);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +7017,7 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
- raw_spin_unlock(&env->dst_rq->lock);
+ rq_unlock(env->dst_rq, &rf);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +7025,9 @@ static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
/*
@@ -6954,6 +7035,8 @@ static void update_blocked_averages(int cpu)
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct sched_entity *se;
+
/* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq))
continue;
@@ -6961,11 +7044,12 @@ static void update_blocked_averages(int cpu)
if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
- /* Propagate pending load changes to the parent */
- if (cfs_rq->tg->se[cpu])
- update_load_avg(cfs_rq->tg->se[cpu], 0);
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(se, 0);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
/*
@@ -7019,12 +7103,12 @@ static inline void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq = &rq->cfs;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7609,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
+ struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false;
@@ -7541,7 +7626,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
if (local_group) {
sds->local = sg;
- sgs = &sds->local_stat;
+ sgs = local;
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7650,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- group_has_capacity(env, &sds->local_stat) &&
- (sgs->sum_nr_running > 1)) {
+ group_has_capacity(env, local) &&
+ (sgs->sum_nr_running > local->sum_nr_running + 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
@@ -8042,7 +8127,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
- unsigned long flags;
+ struct rq_flags rf;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
struct lb_env env = {
@@ -8105,7 +8190,7 @@ redo:
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
- raw_spin_lock_irqsave(&busiest->lock, flags);
+ rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
/*
@@ -8122,14 +8207,14 @@ more_balance:
* See task_rq_lock() family for the details.
*/
- raw_spin_unlock(&busiest->lock);
+ rq_unlock(busiest, &rf);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
- local_irq_restore(flags);
+ local_irq_restore(rf.flags);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8292,8 @@ more_balance:
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
+ unsigned long flags;
+
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8531,9 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
struct task_struct *p = NULL;
+ struct rq_flags rf;
- raw_spin_lock_irq(&busiest_rq->lock);
+ rq_lock_irq(busiest_rq, &rf);
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8584,7 @@ static int active_load_balance_cpu_stop(void *data)
rcu_read_unlock();
out_unlock:
busiest_rq->active_balance = 0;
- raw_spin_unlock(&busiest_rq->lock);
+ rq_unlock(busiest_rq, &rf);
if (p)
attach_one_task(target_rq, p);
@@ -8794,10 +8882,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
- raw_spin_lock_irq(&rq->lock);
+ struct rq_flags rf;
+
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
cpu_load_update_idle(rq);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
+
rebalance_domains(rq, CPU_IDLE);
}
@@ -8988,8 +9079,9 @@ static void task_fork_fair(struct task_struct *p)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
struct rq *rq = this_rq();
+ struct rq_flags rf;
- raw_spin_lock(&rq->lock);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9102,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
- raw_spin_unlock(&rq->lock);
+ rq_unlock(rq, &rf);
}
/*
@@ -9372,7 +9464,6 @@ static DEFINE_MUTEX(shares_mutex);
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
- unsigned long flags;
/*
* We can't change the weight of the root cgroup.
@@ -9389,19 +9480,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
tg->shares = shares;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
- struct sched_entity *se;
+ struct sched_entity *se = tg->se[i];
+ struct rq_flags rf;
- se = tg->se[i];
/* Propagate contribution to hierarchy */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- /* Possible calls to update_curr() need rq clock */
+ rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
for_each_sched_entity(se) {
update_load_avg(se, UPDATE_TG);
update_cfs_shares(se);
}
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ rq_unlock_irqrestore(rq, &rf);
}
done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..11192e0cb122 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
*/
SCHED_FEAT(SIS_AVG_CPU, false)
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec..979b7341008a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
#define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ * (None or zero): No IPI is going around for the current rq
+ * RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
+ * RT_PUSH_IPI_RESTART: The priority of the running task for the rq
+ * has changed, and the IPI should restart
+ * circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
static void tell_cpu_to_push(struct rq *rq)
{
int cpu;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..de4b934ba974 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
+#define ENQUEUE_NOCLOCK 0x08
-#define ENQUEUE_HEAD 0x08
-#define ENQUEUE_REPLENISH 0x10
+#define ENQUEUE_HEAD 0x10
+#define ENQUEUE_REPLENISH 0x20
#ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED 0x20
+#define ENQUEUE_MIGRATED 0x40
#else
#define ENQUEUE_MIGRATED 0x00
#endif
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
+
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irqsave(&rq->lock, rf->flags);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ raw_spin_lock(&rq->lock);
+ rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+ __releases(rq->lock)
+{
+ rq_unpin_lock(rq, rf);
+ raw_spin_unlock(&rq->lock);
+}
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 97ac0951f164..4237e0744e26 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
-void clockevents_config(struct clock_event_device *dev, u32 freq)
+static void clockevents_config(struct clock_event_device *dev, u32 freq)
{
u64 sec;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index a7098f0737ed..0e13f211b714 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1368,10 +1368,7 @@ retry:
ktime_to_ns(delta));
}
-/*
- * local version of hrtimer_peek_ahead_timers() called with interrupts
- * disabled.
- */
+/* called with interrupts disabled */
static inline void __hrtimer_peek_ahead_timers(void)
{
struct tick_device *td;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 4513ad16a253..76bea3a47d4b 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
+ pr_info("CPU Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk,
soft += USEC_PER_SEC;
sig->rlim[RLIMIT_RTTIME].rlim_cur = soft;
}
- printk(KERN_INFO
- "RT Watchdog Timeout: %s[%d]\n",
+ pr_info("RT Watchdog Timeout (soft): %s[%d]\n",
tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
@@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk,
* At the hard limit, we just die.
* No need to calculate anything else now.
*/
+ pr_info("RT Watchdog Timeout (hard): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
return;
}
@@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk,
/*
* At the soft limit, send a SIGXCPU every second.
*/
+ pr_info("CPU Watchdog Timeout (soft): %s[%d]\n",
+ tsk->comm, task_pid_nr(tsk));
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
if (soft < hard) {
soft++;
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index ea6b610c4c57..2d8f05aad442 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
update_clock_read_data(&rd);
+ if (sched_clock_timer.function != NULL) {
+ /* update timeout for clock wrap */
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ }
+
r = rate;
if (r >= 4000000) {
r /= 1000000;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5b63a2102c29..9652bc57fd09 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history,
return 0;
/* Interpolate shortest distance from beginning or end of history */
- interp_forward = partial_history_cycles > total_history_cycles/2 ?
- true : false;
+ interp_forward = partial_history_cycles > total_history_cycles / 2;
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index ff8d5c13d04b..0e7f5428a148 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -16,6 +16,7 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
+#include <linux/nmi.h>
#include <linux/uaccess.h>
@@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
next_one:
i = 0;
+
+ touch_nmi_watchdog();
+
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
curr = timerqueue_getnext(&base->active);
@@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
+ touch_nmi_watchdog();
+
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4a06e714645..9619b5768e4b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -455,7 +455,7 @@ config UPROBE_EVENTS
select UPROBES
select PROBE_EVENTS
select TRACING
- default n
+ default y
help
This allows the user to add tracing events on top of userspace
dynamic events (similar to tracepoints) on the fly via the trace
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f35109514a01..0ed834d6beb0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4355,6 +4355,7 @@ static const char readme_msg[] =
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+ "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
#endif
#ifdef CONFIG_UPROBE_EVENTS
"\t place: <path>:<offset>\n"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5f688cc724f0..013f4e7146d4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -681,10 +681,6 @@ static int create_trace_kprobe(int argc, char **argv)
return -EINVAL;
}
if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
/* an address specified */
ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
if (ret) {
@@ -700,8 +696,9 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Failed to parse symbol.\n");
return ret;
}
- if (offset && is_return) {
- pr_info("Return probe must be used without offset.\n");
+ if (offset && is_return &&
+ !function_offset_within_entry(NULL, symbol, offset)) {
+ pr_info("Given offset is not valid for return probe.\n");
return -EINVAL;
}
}
diff --git a/lib/bug.c b/lib/bug.c
index 06edbbef0623..a6a1137d06db 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -47,7 +47,7 @@
#include <linux/sched.h>
#include <linux/rculist.h>
-extern const struct bug_entry __start___bug_table[], __stop___bug_table[];
+extern struct bug_entry __start___bug_table[], __stop___bug_table[];
static inline unsigned long bug_addr(const struct bug_entry *bug)
{
@@ -62,10 +62,10 @@ static inline unsigned long bug_addr(const struct bug_entry *bug)
/* Updates are protected by module mutex */
static LIST_HEAD(module_bug_list);
-static const struct bug_entry *module_find_bug(unsigned long bugaddr)
+static struct bug_entry *module_find_bug(unsigned long bugaddr)
{
struct module *mod;
- const struct bug_entry *bug = NULL;
+ struct bug_entry *bug = NULL;
rcu_read_lock_sched();
list_for_each_entry_rcu(mod, &module_bug_list, bug_list) {
@@ -122,15 +122,15 @@ void module_bug_cleanup(struct module *mod)
#else
-static inline const struct bug_entry *module_find_bug(unsigned long bugaddr)
+static inline struct bug_entry *module_find_bug(unsigned long bugaddr)
{
return NULL;
}
#endif
-const struct bug_entry *find_bug(unsigned long bugaddr)
+struct bug_entry *find_bug(unsigned long bugaddr)
{
- const struct bug_entry *bug;
+ struct bug_entry *bug;
for (bug = __start___bug_table; bug < __stop___bug_table; ++bug)
if (bugaddr == bug_addr(bug))
@@ -141,9 +141,9 @@ const struct bug_entry *find_bug(unsigned long bugaddr)
enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
{
- const struct bug_entry *bug;
+ struct bug_entry *bug;
const char *file;
- unsigned line, warning;
+ unsigned line, warning, once, done;
if (!is_valid_bugaddr(bugaddr))
return BUG_TRAP_TYPE_NONE;
@@ -164,6 +164,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
line = bug->line;
#endif
warning = (bug->flags & BUGFLAG_WARNING) != 0;
+ once = (bug->flags & BUGFLAG_ONCE) != 0;
+ done = (bug->flags & BUGFLAG_DONE) != 0;
+
+ if (warning && once) {
+ if (done)
+ return BUG_TRAP_TYPE_WARN;
+
+ /*
+ * Since this is the only store, concurrency is not an issue.
+ */
+ bug->flags |= BUGFLAG_DONE;
+ }
}
if (warning) {
diff --git a/lib/refcount.c b/lib/refcount.c
index aa09ad3c30b0..f42124ccf295 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -37,11 +37,29 @@
#include <linux/refcount.h>
#include <linux/bug.h>
+/**
+ * refcount_add_not_zero - add a value to a refcount unless it is 0
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ *
+ * Return: false if the passed refcount is 0, true otherwise
+ */
bool refcount_add_not_zero(unsigned int i, refcount_t *r)
{
- unsigned int old, new, val = atomic_read(&r->refs);
+ unsigned int new, val = atomic_read(&r->refs);
- for (;;) {
+ do {
if (!val)
return false;
@@ -51,12 +69,8 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
new = val + i;
if (new < val)
new = UINT_MAX;
- old = atomic_cmpxchg_relaxed(&r->refs, val, new);
- if (old == val)
- break;
- val = old;
- }
+ } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new));
WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
@@ -64,24 +78,45 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
}
EXPORT_SYMBOL_GPL(refcount_add_not_zero);
+/**
+ * refcount_add - add a value to a refcount
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_add(), but will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ */
void refcount_add(unsigned int i, refcount_t *r)
{
WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
}
EXPORT_SYMBOL_GPL(refcount_add);
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+/**
+ * refcount_inc_not_zero - increment a refcount unless it is 0
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN.
*
* Provides no memory ordering, it is assumed the caller has guaranteed the
* object memory to be stable (RCU, etc.). It does provide a control dependency
* and thereby orders future stores. See the comment on top.
+ *
+ * Return: true if the increment was successful, false otherwise
*/
bool refcount_inc_not_zero(refcount_t *r)
{
- unsigned int old, new, val = atomic_read(&r->refs);
+ unsigned int new, val = atomic_read(&r->refs);
- for (;;) {
+ do {
new = val + 1;
if (!val)
@@ -90,12 +125,7 @@ bool refcount_inc_not_zero(refcount_t *r)
if (unlikely(!new))
return true;
- old = atomic_cmpxchg_relaxed(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
+ } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new));
WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
@@ -103,11 +133,17 @@ bool refcount_inc_not_zero(refcount_t *r)
}
EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+/**
+ * refcount_inc - increment a refcount
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN.
*
* Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
+ * reference on the object.
+ *
+ * Will WARN if the refcount is 0, as this represents a possible use-after-free
+ * condition.
*/
void refcount_inc(refcount_t *r)
{
@@ -115,11 +151,31 @@ void refcount_inc(refcount_t *r)
}
EXPORT_SYMBOL_GPL(refcount_inc);
+/**
+ * refcount_sub_and_test - subtract from a refcount and test if it is 0
+ * @i: amount to subtract from the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_dec_and_test(), but it will WARN, return false and
+ * ultimately leak on underflow and will fail to decrement when saturated
+ * at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time. In these
+ * cases, refcount_dec(), or one of its variants, should instead be used to
+ * decrement a reference count.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
+ */
bool refcount_sub_and_test(unsigned int i, refcount_t *r)
{
- unsigned int old, new, val = atomic_read(&r->refs);
+ unsigned int new, val = atomic_read(&r->refs);
- for (;;) {
+ do {
if (unlikely(val == UINT_MAX))
return false;
@@ -129,24 +185,24 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r)
return false;
}
- old = atomic_cmpxchg_release(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
+ } while (!atomic_try_cmpxchg_release(&r->refs, &val, new));
return !new;
}
EXPORT_SYMBOL_GPL(refcount_sub_and_test);
-/*
+/**
+ * refcount_dec_and_test - decrement a refcount and test if it is 0
+ * @r: the refcount
+ *
* Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
* decrement when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
*/
bool refcount_dec_and_test(refcount_t *r)
{
@@ -154,21 +210,26 @@ bool refcount_dec_and_test(refcount_t *r)
}
EXPORT_SYMBOL_GPL(refcount_dec_and_test);
-/*
+/**
+ * refcount_dec - decrement a refcount
+ * @r: the refcount
+ *
* Similar to atomic_dec(), it will WARN on underflow and fail to decrement
* when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before.
*/
-
void refcount_dec(refcount_t *r)
{
WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
}
EXPORT_SYMBOL_GPL(refcount_dec);
-/*
+/**
+ * refcount_dec_if_one - decrement a refcount if it is 1
+ * @r: the refcount
+ *
* No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
* success thereof.
*
@@ -178,24 +239,33 @@ EXPORT_SYMBOL_GPL(refcount_dec);
* It can be used like a try-delete operator; this explicit case is provided
* and not cmpxchg in generic, because that would allow implementing unsafe
* operations.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
*/
bool refcount_dec_if_one(refcount_t *r)
{
- return atomic_cmpxchg_release(&r->refs, 1, 0) == 1;
+ int val = 1;
+
+ return atomic_try_cmpxchg_release(&r->refs, &val, 0);
}
EXPORT_SYMBOL_GPL(refcount_dec_if_one);
-/*
+/**
+ * refcount_dec_not_one - decrement a refcount if it is not 1
+ * @r: the refcount
+ *
* No atomic_t counterpart, it decrements unless the value is 1, in which case
* it will return false.
*
* Was often done like: atomic_add_unless(&var, -1, 1)
+ *
+ * Return: true if the decrement operation was successful, false otherwise
*/
bool refcount_dec_not_one(refcount_t *r)
{
- unsigned int old, new, val = atomic_read(&r->refs);
+ unsigned int new, val = atomic_read(&r->refs);
- for (;;) {
+ do {
if (unlikely(val == UINT_MAX))
return true;
@@ -208,24 +278,27 @@ bool refcount_dec_not_one(refcount_t *r)
return true;
}
- old = atomic_cmpxchg_release(&r->refs, val, new);
- if (old == val)
- break;
-
- val = old;
- }
+ } while (!atomic_try_cmpxchg_release(&r->refs, &val, new));
return true;
}
EXPORT_SYMBOL_GPL(refcount_dec_not_one);
-/*
+/**
+ * refcount_dec_and_mutex_lock - return holding mutex if able to decrement
+ * refcount to 0
+ * @r: the refcount
+ * @lock: the mutex to be locked
+ *
* Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
* to decrement when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
+ *
+ * Return: true and hold mutex if able to decrement refcount to 0, false
+ * otherwise
*/
bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
{
@@ -242,13 +315,21 @@ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
}
EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
-/*
+/**
+ * refcount_dec_and_lock - return holding spinlock if able to decrement
+ * refcount to 0
+ * @r: the refcount
+ * @lock: the spinlock to be locked
+ *
* Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
* decrement when saturated at UINT_MAX.
*
* Provides release memory ordering, such that prior loads and stores are done
* before, and provides a control dependency such that free() must come after.
* See the comment on top.
+ *
+ * Return: true and hold spinlock if able to decrement refcount to 0, false
+ * otherwise
*/
bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
{
diff --git a/mm/Kconfig b/mm/Kconfig
index 9b8fccb969dc..c89f472b658c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_RCU_GUP
+config HAVE_GENERIC_GUP
bool
config ARCH_DISCARD_MEMBLOCK
diff --git a/mm/gup.c b/mm/gup.c
index 04aa405350dc..2559a3987de7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1155,7 +1155,7 @@ struct page *get_dump_page(unsigned long addr)
#endif /* CONFIG_ELF_CORE */
/*
- * Generic RCU Fast GUP
+ * Generic Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -1176,8 +1176,8 @@ struct page *get_dump_page(unsigned long addr)
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
- * pages containing page tables.
+ * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
*
@@ -1187,36 +1187,59 @@ struct page *get_dump_page(unsigned long addr)
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifdef CONFIG_HAVE_GENERIC_GUP
+
+#ifndef gup_get_pte
+/*
+ * We assume that the PTE can be read atomically. If this is not the case for
+ * your architecture, please provide the helper.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+#endif
+
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
+ int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
- int ret = 0;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
- /*
- * In the line below we are assuming that the pte can be read
- * atomically. If this is not the case for your architecture,
- * please wrap this in a helper function!
- *
- * for an example see gup_get_pte in arch/x86/mm/gup.c
- */
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = gup_get_pte(ptep);
struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
* path using the pte_protnone check.
*/
- if (!pte_present(pte) || pte_special(pte) ||
- pte_protnone(pte) || (write && !pte_write(pte)))
+ if (pte_protnone(pte))
goto pte_unmap;
- if (!arch_pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, write))
+ goto pte_unmap;
+
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ goto pte_unmap;
+ }
+ } else if (pte_special(pte))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
+
+ put_dev_pagemap(pgmap);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static int __gup_device_huge(unsigned long pfn, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ struct dev_pagemap *pgmap = NULL;
+
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+#else
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (write && !pmd_write(orig))
+ if (!pmd_access_permitted(orig, write))
return 0;
+ if (pmd_devmap(orig))
+ return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
struct page *head, *page;
int refs;
- if (write && !pud_write(orig))
+ if (!pud_access_permitted(orig, write))
return 0;
+ if (pud_devmap(orig))
+ return __gup_device_huge_pud(orig, addr, end, pages, nr);
+
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
int refs;
struct page *head, *page;
- if (write && !pgd_write(orig))
+ if (!pgd_access_permitted(orig, write))
return 0;
+ BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ return end >= start;
+}
+#endif
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- int nr, ret;
+ int nr = 0, ret = 0;
start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
- ret = nr;
+
+ if (gup_fast_permitted(start, nr_pages, write)) {
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+ }
if (nr < nr_pages) {
/* Try to get the remaining pages with get_user_pages */
@@ -1565,4 +1677,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
return ret;
}
-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+#endif /* CONFIG_HAVE_GENERIC_GUP */
diff --git a/mm/percpu.c b/mm/percpu.c
index 60a6488e9e6d..e0aa8ae7bde7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area. Module
- * static percpu areas are not considered. For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr)
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
+ void *va = (void *)addr;
- if ((void *)addr >= start && (void *)addr < start + static_size)
+ if (va >= start && va < start + static_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
return true;
- }
+ }
+ }
#endif
/* on UP, can't distinguish from other static vars, always false */
return false;
}
/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+ return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
diff --git a/scripts/checksyscalls.sh b/scripts/checksyscalls.sh
index 2c9082ba6137..116b7735ee9f 100755
--- a/scripts/checksyscalls.sh
+++ b/scripts/checksyscalls.sh
@@ -148,6 +148,7 @@ cat << EOF
#define __IGNORE_sysfs
#define __IGNORE_uselib
#define __IGNORE__sysctl
+#define __IGNORE_arch_prctl
/* ... including the "new" 32-bit uid syscalls */
#define __IGNORE_lchown32
diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h
index af05f8e0903e..6ebd3e6a1fd1 100644
--- a/tools/arch/arm/include/uapi/asm/kvm.h
+++ b/tools/arch/arm/include/uapi/asm/kvm.h
@@ -181,10 +181,23 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2
#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+ (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL 0
+
#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
/* KVM_IRQ_LINE irq field index values */
diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h
index 3051f86a9b5f..c2860358ae3e 100644
--- a/tools/arch/arm64/include/uapi/asm/kvm.h
+++ b/tools/arch/arm64/include/uapi/asm/kvm.h
@@ -201,10 +201,23 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2
#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+ (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL 0
+
#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
/* Device Control API on vcpu fd */
diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h
index 3603b6f51b11..4edbe4bb0e8b 100644
--- a/tools/arch/powerpc/include/uapi/asm/kvm.h
+++ b/tools/arch/powerpc/include/uapi/asm/kvm.h
@@ -413,6 +413,26 @@ struct kvm_get_htab_header {
__u16 n_invalid;
};
+/* For KVM_PPC_CONFIGURE_V3_MMU */
+struct kvm_ppc_mmuv3_cfg {
+ __u64 flags;
+ __u64 process_table; /* second doubleword of partition table entry */
+};
+
+/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */
+#define KVM_PPC_MMUV3_RADIX 1 /* 1 = radix mode, 0 = HPT */
+#define KVM_PPC_MMUV3_GTSE 2 /* global translation shootdown enb. */
+
+/* For KVM_PPC_GET_RMMU_INFO */
+struct kvm_ppc_rmmu_info {
+ struct kvm_ppc_radix_geom {
+ __u8 page_shift;
+ __u8 level_bits[4];
+ __u8 pad[3];
+ } geometries[8];
+ __u32 ap_encodings[8];
+};
+
/* Per-vcpu XICS interrupt controller state */
#define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
@@ -613,5 +633,7 @@ struct kvm_get_htab_header {
#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40)
#define KVM_XICS_MASKED (1ULL << 41)
#define KVM_XICS_PENDING (1ULL << 42)
+#define KVM_XICS_PRESENTED (1ULL << 43)
+#define KVM_XICS_QUEUED (1ULL << 44)
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h
index 059e33e94260..328eeceec709 100644
--- a/tools/arch/x86/include/asm/atomic.h
+++ b/tools/arch/x86/include/asm/atomic.h
@@ -7,6 +7,8 @@
#define LOCK_PREFIX "\n\tlock; "
+#include <asm/cmpxchg.h>
+
/*
* Atomic operations that C can't guarantee us. Useful for
* resource counting etc..
@@ -62,4 +64,9 @@ static inline int atomic_dec_and_test(atomic_t *v)
GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
}
+static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
#endif /* _TOOLS_LINUX_ASM_X86_ATOMIC_H */
diff --git a/tools/arch/x86/include/asm/cmpxchg.h b/tools/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..f5253260f3cc
--- /dev/null
+++ b/tools/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,89 @@
+#ifndef TOOLS_ASM_X86_CMPXCHG_H
+#define TOOLS_ASM_X86_CMPXCHG_H
+
+#include <linux/compiler.h>
+
+/*
+ * Non-existant functions to indicate usage errors at link time
+ * (or compile-time if the compiler implements __compiletime_error().
+ */
+extern void __cmpxchg_wrong_size(void)
+ __compiletime_error("Bad argument size for cmpxchg");
+
+/*
+ * Constants for operation sizes. On 32-bit, the 64-bit size it set to
+ * -1 because sizeof will never return -1, thereby making those switch
+ * case statements guaranteeed dead code which the compiler will
+ * eliminate, and allowing the "missing symbol in the default case" to
+ * indicate a usage error.
+ */
+#define __X86_CASE_B 1
+#define __X86_CASE_W 2
+#define __X86_CASE_L 4
+#ifdef __x86_64__
+#define __X86_CASE_Q 8
+#else
+#define __X86_CASE_Q -1 /* sizeof will never return -1 */
+#endif
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
+ asm volatile(lock "cmpxchgq %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ default: \
+ __cmpxchg_wrong_size(); \
+ } \
+ __ret; \
+})
+
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+
+#define cmpxchg(ptr, old, new) \
+ __cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+
+#endif /* TOOLS_ASM_X86_CMPXCHG_H */
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 293149a1c6a1..b04bb6dfed7f 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -100,7 +100,7 @@
#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
-/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
+#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
@@ -186,7 +186,7 @@
*
* Reuse free bits when adding new feature flags!
*/
-
+#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
@@ -289,7 +289,8 @@
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */
+#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
@@ -321,5 +322,4 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
-
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
index e3fb5ecbdcb6..523911f316ce 100644
--- a/tools/build/Makefile.feature
+++ b/tools/build/Makefile.feature
@@ -63,6 +63,7 @@ FEATURE_TESTS_BASIC := \
lzma \
get_cpuid \
bpf \
+ sched_getcpu \
sdt
# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index b564a2eea039..09c9626ea666 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -48,12 +48,13 @@ FILES= \
test-get_cpuid.bin \
test-sdt.bin \
test-cxx.bin \
- test-jvmti.bin
+ test-jvmti.bin \
+ test-sched_getcpu.bin
FILES := $(addprefix $(OUTPUT),$(FILES))
-CC := $(CROSS_COMPILE)gcc -MD
-CXX := $(CROSS_COMPILE)g++ -MD
+CC ?= $(CROSS_COMPILE)gcc -MD
+CXX ?= $(CROSS_COMPILE)g++ -MD
PKG_CONFIG := $(CROSS_COMPILE)pkg-config
LLVM_CONFIG ?= llvm-config
@@ -91,6 +92,9 @@ $(OUTPUT)test-libelf.bin:
$(OUTPUT)test-glibc.bin:
$(BUILD)
+$(OUTPUT)test-sched_getcpu.bin:
+ $(BUILD)
+
DWARFLIBS := -ldw
ifeq ($(findstring -static,${LDFLAGS}),-static)
DWARFLIBS += -lelf -lebl -lz -llzma -lbz2
diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c
index 699e43627397..cc6c7c01f4ca 100644
--- a/tools/build/feature/test-all.c
+++ b/tools/build/feature/test-all.c
@@ -117,6 +117,10 @@
# include "test-pthread-attr-setaffinity-np.c"
#undef main
+#define main main_test_sched_getcpu
+# include "test-sched_getcpu.c"
+#undef main
+
# if 0
/*
* Disable libbabeltrace check for test-all, because the requested
@@ -182,6 +186,7 @@ int main(int argc, char *argv[])
main_test_get_cpuid();
main_test_bpf();
main_test_libcrypto();
+ main_test_sched_getcpu();
main_test_sdt();
return 0;
diff --git a/tools/build/feature/test-sched_getcpu.c b/tools/build/feature/test-sched_getcpu.c
new file mode 100644
index 000000000000..c4a148dd7104
--- /dev/null
+++ b/tools/build/feature/test-sched_getcpu.c
@@ -0,0 +1,7 @@
+#define _GNU_SOURCE
+#include <sched.h>
+
+int main(void)
+{
+ return sched_getcpu();
+}
diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h
index 2ba78c9f5701..5e9738f97bf3 100644
--- a/tools/include/asm-generic/atomic-gcc.h
+++ b/tools/include/asm-generic/atomic-gcc.h
@@ -60,4 +60,12 @@ static inline int atomic_dec_and_test(atomic_t *v)
return __sync_sub_and_fetch(&v->counter, 1) == 0;
}
+#define cmpxchg(ptr, oldval, newval) \
+ __sync_val_compare_and_swap(ptr, oldval, newval)
+
+static inline int atomic_cmpxchg(atomic_t *v, int oldval, int newval)
+{
+ return cmpxchg(&(v)->counter, oldval, newval);
+}
+
#endif /* __TOOLS_ASM_GENERIC_ATOMIC_H */
diff --git a/tools/include/linux/atomic.h b/tools/include/linux/atomic.h
index 4e3d3d18ebab..9f21fc2b092b 100644
--- a/tools/include/linux/atomic.h
+++ b/tools/include/linux/atomic.h
@@ -3,4 +3,10 @@
#include <asm/atomic.h>
+/* atomic_cmpxchg_relaxed */
+#ifndef atomic_cmpxchg_relaxed
+#define atomic_cmpxchg_relaxed atomic_cmpxchg
+#define atomic_cmpxchg_release atomic_cmpxchg
+#endif /* atomic_cmpxchg_relaxed */
+
#endif /* __TOOLS_LINUX_ATOMIC_H */
diff --git a/tools/include/linux/compiler-gcc.h b/tools/include/linux/compiler-gcc.h
index 48af2f10a42d..616935f1ff56 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -12,3 +12,7 @@
#if GCC_VERSION >= 70000 && !defined(__CHECKER__)
# define __fallthrough __attribute__ ((fallthrough))
#endif
+
+#if GCC_VERSION >= 40300
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* GCC_VERSION >= 40300 */
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 8de163b17c0d..c9e65e8faacd 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -5,6 +5,10 @@
#include <linux/compiler-gcc.h>
#endif
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
/* Optimization barrier */
/* The "volatile" is due to gcc bugs */
#define barrier() __asm__ __volatile__("": : :"memory")
diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h
index 28607db02bd3..adb4d0147755 100644
--- a/tools/include/linux/kernel.h
+++ b/tools/include/linux/kernel.h
@@ -5,6 +5,10 @@
#include <stddef.h>
#include <assert.h>
+#ifndef UINT_MAX
+#define UINT_MAX (~0U)
+#endif
+
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
#define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
diff --git a/tools/include/linux/refcount.h b/tools/include/linux/refcount.h
new file mode 100644
index 000000000000..a0177c1f55b1
--- /dev/null
+++ b/tools/include/linux/refcount.h
@@ -0,0 +1,151 @@
+#ifndef _TOOLS_LINUX_REFCOUNT_H
+#define _TOOLS_LINUX_REFCOUNT_H
+
+/*
+ * Variant of atomic_t specialized for reference counts.
+ *
+ * The interface matches the atomic_t interface (to aid in porting) but only
+ * provides the few functions one should use for reference counting.
+ *
+ * It differs in that the counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free issues.
+ *
+ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
+ * and provide only what is strictly required for refcounts.
+ *
+ * The increments are fully relaxed; these will not provide ordering. The
+ * rationale is that whatever is used to obtain the object we're increasing the
+ * reference count on will provide the ordering. For locked data structures,
+ * its the lock acquire, for RCU/lockless data structures its the dependent
+ * load.
+ *
+ * Do note that inc_not_zero() provides a control dependency which will order
+ * future stores against the inc, this ensures we'll never modify the object
+ * if we did not in fact acquire a reference.
+ *
+ * The decrements will provide release order, such that all the prior loads and
+ * stores will be issued before, it also provides a control dependency, which
+ * will order us against the subsequent free().
+ *
+ * The control dependency is against the load of the cmpxchg (ll/sc) that
+ * succeeded. This means the stores aren't fully ordered, but this is fine
+ * because the 1->0 transition indicates no concurrency.
+ *
+ * Note that the allocator is responsible for ordering things between free()
+ * and alloc().
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+
+#ifdef NDEBUG
+#define REFCOUNT_WARN(cond, str) (void)(cond)
+#define __refcount_check
+#else
+#define REFCOUNT_WARN(cond, str) BUG_ON(cond)
+#define __refcount_check __must_check
+#endif
+
+typedef struct refcount_struct {
+ atomic_t refs;
+} refcount_t;
+
+#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), }
+
+static inline void refcount_set(refcount_t *r, unsigned int n)
+{
+ atomic_set(&r->refs, n);
+}
+
+static inline unsigned int refcount_read(const refcount_t *r)
+{
+ return atomic_read(&r->refs);
+}
+
+/*
+ * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_inc_not_zero(refcount_t *r)
+{
+ unsigned int old, new, val = atomic_read(&r->refs);
+
+ for (;;) {
+ new = val + 1;
+
+ if (!val)
+ return false;
+
+ if (unlikely(!new))
+ return true;
+
+ old = atomic_cmpxchg_relaxed(&r->refs, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ REFCOUNT_WARN(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n");
+
+ return true;
+}
+
+/*
+ * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller already has a
+ * reference on the object, will WARN when this is not so.
+ */
+static inline void refcount_inc(refcount_t *r)
+{
+ REFCOUNT_WARN(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n");
+}
+
+/*
+ * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
+ * decrement when saturated at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ */
+static inline __refcount_check
+bool refcount_sub_and_test(unsigned int i, refcount_t *r)
+{
+ unsigned int old, new, val = atomic_read(&r->refs);
+
+ for (;;) {
+ if (unlikely(val == UINT_MAX))
+ return false;
+
+ new = val - i;
+ if (new > val) {
+ REFCOUNT_WARN(new > val, "refcount_t: underflow; use-after-free.\n");
+ return false;
+ }
+
+ old = atomic_cmpxchg_release(&r->refs, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ return !new;
+}
+
+static inline __refcount_check
+bool refcount_dec_and_test(refcount_t *r)
+{
+ return refcount_sub_and_test(1, r);
+}
+
+
+#endif /* _ATOMIC_LINUX_REFCOUNT_H */
diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h
index c24b3e3ae296..77a28a26a670 100644
--- a/tools/include/linux/types.h
+++ b/tools/include/linux/types.h
@@ -7,6 +7,7 @@
#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
#include <asm/types.h>
+#include <asm/posix_types.h>
struct page;
struct kmem_cache;
diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h
new file mode 100644
index 000000000000..813afd6eee71
--- /dev/null
+++ b/tools/include/uapi/linux/fcntl.h
@@ -0,0 +1,72 @@
+#ifndef _UAPI_LINUX_FCNTL_H
+#define _UAPI_LINUX_FCNTL_H
+
+#include <asm/fcntl.h>
+
+#define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0)
+#define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1)
+
+/*
+ * Cancel a blocking posix lock; internal use only until we expose an
+ * asynchronous lock api to userspace:
+ */
+#define F_CANCELLK (F_LINUX_SPECIFIC_BASE + 5)
+
+/* Create a file descriptor with FD_CLOEXEC set. */
+#define F_DUPFD_CLOEXEC (F_LINUX_SPECIFIC_BASE + 6)
+
+/*
+ * Request nofications on a directory.
+ * See below for events that may be notified.
+ */
+#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2)
+
+/*
+ * Set and get of pipe page size array
+ */
+#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
+#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
+
+/*
+ * Set/Get seals
+ */
+#define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+
+/*
+ * Types of seals
+ */
+#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
+#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
+#define F_SEAL_GROW 0x0004 /* prevent file from growing */
+#define F_SEAL_WRITE 0x0008 /* prevent writes */
+/* (1U << 31) is reserved for signed error codes */
+
+/*
+ * Types of directory notifications that may be requested.
+ */
+#define DN_ACCESS 0x00000001 /* File accessed */
+#define DN_MODIFY 0x00000002 /* File modified */
+#define DN_CREATE 0x00000004 /* File created */
+#define DN_DELETE 0x00000008 /* File removed */
+#define DN_RENAME 0x00000010 /* File renamed */
+#define DN_ATTRIB 0x00000020 /* File changed attibutes */
+#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
+
+#define AT_FDCWD -100 /* Special value used to indicate
+ openat should use the current
+ working directory. */
+#define AT_SYMLINK_NOFOLLOW 0x100 /* Do not follow symbolic links. */
+#define AT_REMOVEDIR 0x200 /* Remove directory instead of
+ unlinking file. */
+#define AT_SYMLINK_FOLLOW 0x400 /* Follow symbolic links. */
+#define AT_NO_AUTOMOUNT 0x800 /* Suppress terminal automount traversal */
+#define AT_EMPTY_PATH 0x1000 /* Allow empty relative pathname */
+
+#define AT_STATX_SYNC_TYPE 0x6000 /* Type of synchronisation required from statx() */
+#define AT_STATX_SYNC_AS_STAT 0x0000 /* - Do whatever stat() does */
+#define AT_STATX_FORCE_SYNC 0x2000 /* - Force the attributes to be sync'd with the server */
+#define AT_STATX_DONT_SYNC 0x4000 /* - Don't sync attributes with the server */
+
+
+#endif /* _UAPI_LINUX_FCNTL_H */
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index c66a485a24ac..d09a9cd021b1 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
use_clockid : 1, /* use @clockid for time fields */
context_switch : 1, /* context switch data */
write_backward : 1, /* Write ring buffer from end to beginning */
- __reserved_1 : 36;
+ namespaces : 1, /* include namespaces data */
+ __reserved_1 : 35;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -610,6 +611,23 @@ struct perf_event_header {
__u16 size;
};
+struct perf_ns_link_info {
+ __u64 dev;
+ __u64 ino;
+};
+
+enum {
+ NET_NS_INDEX = 0,
+ UTS_NS_INDEX = 1,
+ IPC_NS_INDEX = 2,
+ PID_NS_INDEX = 3,
+ USER_NS_INDEX = 4,
+ MNT_NS_INDEX = 5,
+ CGROUP_NS_INDEX = 6,
+
+ NR_NAMESPACES, /* number of available namespaces */
+};
+
enum perf_event_type {
/*
@@ -862,6 +880,18 @@ enum perf_event_type {
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,
+ /*
+ * struct {
+ * struct perf_event_header header;
+ * u32 pid;
+ * u32 tid;
+ * u64 nr_namespaces;
+ * { u64 dev, inode; } [nr_namespaces];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_NAMESPACES = 16,
+
PERF_RECORD_MAX, /* non-ABI */
};
@@ -885,6 +915,7 @@ enum perf_callchain_context {
*/
#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */
#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */
+#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */
#define PERF_FLAG_FD_NO_GROUP (1UL << 0)
#define PERF_FLAG_FD_OUTPUT (1UL << 1)
diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h
new file mode 100644
index 000000000000..51a6b86e3700
--- /dev/null
+++ b/tools/include/uapi/linux/stat.h
@@ -0,0 +1,176 @@
+#ifndef _UAPI_LINUX_STAT_H
+#define _UAPI_LINUX_STAT_H
+
+#include <linux/types.h>
+
+#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2)
+
+#define S_IFMT 00170000
+#define S_IFSOCK 0140000
+#define S_IFLNK 0120000
+#define S_IFREG 0100000
+#define S_IFBLK 0060000
+#define S_IFDIR 0040000
+#define S_IFCHR 0020000
+#define S_IFIFO 0010000
+#define S_ISUID 0004000
+#define S_ISGID 0002000
+#define S_ISVTX 0001000
+
+#define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK)
+#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
+#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
+#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR)
+#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK)
+#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO)
+#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK)
+
+#define S_IRWXU 00700
+#define S_IRUSR 00400
+#define S_IWUSR 00200
+#define S_IXUSR 00100
+
+#define S_IRWXG 00070
+#define S_IRGRP 00040
+#define S_IWGRP 00020
+#define S_IXGRP 00010
+
+#define S_IRWXO 00007
+#define S_IROTH 00004
+#define S_IWOTH 00002
+#define S_IXOTH 00001
+
+#endif
+
+/*
+ * Timestamp structure for the timestamps in struct statx.
+ *
+ * tv_sec holds the number of seconds before (negative) or after (positive)
+ * 00:00:00 1st January 1970 UTC.
+ *
+ * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is
+ * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time.
+ *
+ * Note that if both tv_sec and tv_nsec are non-zero, then the two values must
+ * either be both positive or both negative.
+ *
+ * __reserved is held in case we need a yet finer resolution.
+ */
+struct statx_timestamp {
+ __s64 tv_sec;
+ __s32 tv_nsec;
+ __s32 __reserved;
+};
+
+/*
+ * Structures for the extended file attribute retrieval system call
+ * (statx()).
+ *
+ * The caller passes a mask of what they're specifically interested in as a
+ * parameter to statx(). What statx() actually got will be indicated in
+ * st_mask upon return.
+ *
+ * For each bit in the mask argument:
+ *
+ * - if the datum is not supported:
+ *
+ * - the bit will be cleared, and
+ *
+ * - the datum will be set to an appropriate fabricated value if one is
+ * available (eg. CIFS can take a default uid and gid), otherwise
+ *
+ * - the field will be cleared;
+ *
+ * - otherwise, if explicitly requested:
+ *
+ * - the datum will be synchronised to the server if AT_STATX_FORCE_SYNC is
+ * set or if the datum is considered out of date, and
+ *
+ * - the field will be filled in and the bit will be set;
+ *
+ * - otherwise, if not requested, but available in approximate form without any
+ * effort, it will be filled in anyway, and the bit will be set upon return
+ * (it might not be up to date, however, and no attempt will be made to
+ * synchronise the internal state first);
+ *
+ * - otherwise the field and the bit will be cleared before returning.
+ *
+ * Items in STATX_BASIC_STATS may be marked unavailable on return, but they
+ * will have values installed for compatibility purposes so that stat() and
+ * co. can be emulated in userspace.
+ */
+struct statx {
+ /* 0x00 */
+ __u32 stx_mask; /* What results were written [uncond] */
+ __u32 stx_blksize; /* Preferred general I/O size [uncond] */
+ __u64 stx_attributes; /* Flags conveying information about the file [uncond] */
+ /* 0x10 */
+ __u32 stx_nlink; /* Number of hard links */
+ __u32 stx_uid; /* User ID of owner */
+ __u32 stx_gid; /* Group ID of owner */
+ __u16 stx_mode; /* File mode */
+ __u16 __spare0[1];
+ /* 0x20 */
+ __u64 stx_ino; /* Inode number */
+ __u64 stx_size; /* File size */
+ __u64 stx_blocks; /* Number of 512-byte blocks allocated */
+ __u64 __spare1[1];
+ /* 0x40 */
+ struct statx_timestamp stx_atime; /* Last access time */
+ struct statx_timestamp stx_btime; /* File creation time */
+ struct statx_timestamp stx_ctime; /* Last attribute change time */
+ struct statx_timestamp stx_mtime; /* Last data modification time */
+ /* 0x80 */
+ __u32 stx_rdev_major; /* Device ID of special file [if bdev/cdev] */
+ __u32 stx_rdev_minor;
+ __u32 stx_dev_major; /* ID of device containing file [uncond] */
+ __u32 stx_dev_minor;
+ /* 0x90 */
+ __u64 __spare2[14]; /* Spare space for future expansion */
+ /* 0x100 */
+};
+
+/*
+ * Flags to be stx_mask
+ *
+ * Query request/result mask for statx() and struct statx::stx_mask.
+ *
+ * These bits should be set in the mask argument of statx() to request
+ * particular items when calling statx().
+ */
+#define STATX_TYPE 0x00000001U /* Want/got stx_mode & S_IFMT */
+#define STATX_MODE 0x00000002U /* Want/got stx_mode & ~S_IFMT */
+#define STATX_NLINK 0x00000004U /* Want/got stx_nlink */
+#define STATX_UID 0x00000008U /* Want/got stx_uid */
+#define STATX_GID 0x00000010U /* Want/got stx_gid */
+#define STATX_ATIME 0x00000020U /* Want/got stx_atime */
+#define STATX_MTIME 0x00000040U /* Want/got stx_mtime */
+#define STATX_CTIME 0x00000080U /* Want/got stx_ctime */
+#define STATX_INO 0x00000100U /* Want/got stx_ino */
+#define STATX_SIZE 0x00000200U /* Want/got stx_size */
+#define STATX_BLOCKS 0x00000400U /* Want/got stx_blocks */
+#define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */
+#define STATX_BTIME 0x00000800U /* Want/got stx_btime */
+#define STATX_ALL 0x00000fffU /* All currently supported flags */
+
+/*
+ * Attributes to be found in stx_attributes
+ *
+ * These give information about the features or the state of a file that might
+ * be of use to ordinary userspace programs such as GUIs or ls rather than
+ * specialised tools.
+ *
+ * Note that the flags marked [I] correspond to generic FS_IOC_FLAGS
+ * semantically. Where possible, the numerical value is picked to correspond
+ * also.
+ */
+#define STATX_ATTR_COMPRESSED 0x00000004 /* [I] File is compressed by the fs */
+#define STATX_ATTR_IMMUTABLE 0x00000010 /* [I] File is marked immutable */
+#define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */
+#define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */
+#define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */
+
+#define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */
+
+
+#endif /* _UAPI_LINUX_STAT_H */
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index 4b6bfc43cccf..809c7721cd24 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -439,6 +439,35 @@ int sysfs__read_str(const char *entry, char **buf, size_t *sizep)
return filename__read_str(path, buf, sizep);
}
+int sysfs__read_bool(const char *entry, bool *value)
+{
+ char *buf;
+ size_t size;
+ int ret;
+
+ ret = sysfs__read_str(entry, &buf, &size);
+ if (ret < 0)
+ return ret;
+
+ switch (buf[0]) {
+ case '1':
+ case 'y':
+ case 'Y':
+ *value = true;
+ break;
+ case '0':
+ case 'n':
+ case 'N':
+ *value = false;
+ break;
+ default:
+ ret = -1;
+ }
+
+ free(buf);
+
+ return ret;
+}
int sysctl__read_int(const char *sysctl, int *value)
{
char path[PATH_MAX];
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
index 6b332dc74498..956c21127d1e 100644
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -37,4 +37,5 @@ int sysctl__read_int(const char *sysctl, int *value);
int sysfs__read_int(const char *entry, int *value);
int sysfs__read_ull(const char *entry, unsigned long long *value);
int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
+int sysfs__read_bool(const char *entry, bool *value);
#endif /* __API_FS__ */
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index 3db3db9278be..643cc4ba6872 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -31,3 +31,5 @@ config.mak.autogen
.config-detected
util/intel-pt-decoder/inat-tables.c
arch/*/include/generated/
+pmu-events/pmu-events.c
+pmu-events/jevents
diff --git a/tools/perf/Build b/tools/perf/Build
index 9b79f8d7db50..bd8eeb60533c 100644
--- a/tools/perf/Build
+++ b/tools/perf/Build
@@ -50,5 +50,6 @@ libperf-y += util/
libperf-y += arch/
libperf-y += ui/
libperf-y += scripts/
+libperf-y += trace/beauty/
gtk-y += ui/gtk/
diff --git a/tools/perf/Documentation/perf-ftrace.txt b/tools/perf/Documentation/perf-ftrace.txt
index 2d96de6132a9..6e6a8b22c859 100644
--- a/tools/perf/Documentation/perf-ftrace.txt
+++ b/tools/perf/Documentation/perf-ftrace.txt
@@ -30,6 +30,24 @@ OPTIONS
--verbose=::
Verbosity level.
+-p::
+--pid=::
+ Trace on existing process id (comma separated list).
+
+-a::
+--all-cpus::
+ Force system-wide collection. Scripts run without a <command>
+ normally use -a by default, while scripts run with a <command>
+ normally don't - this option allows the latter to be run in
+ system-wide mode.
+
+-C::
+--cpu=::
+ Only trace for the list of CPUs provided. Multiple CPUs can
+ be provided as a comma separated list with no space like: 0,1.
+ Ranges of CPUs are specified with -: 0-2.
+ Default is to trace on all online CPUs.
+
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 41857cce5e86..f709de54707b 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -8,7 +8,7 @@ perf-list - List all symbolic event types
SYNOPSIS
--------
[verse]
-'perf list' [--no-desc] [--long-desc] [hw|sw|cache|tracepoint|pmu|event_glob]
+'perf list' [--no-desc] [--long-desc] [hw|sw|cache|tracepoint|pmu|sdt|event_glob]
DESCRIPTION
-----------
@@ -24,6 +24,10 @@ Don't print descriptions.
--long-desc::
Print longer event descriptions.
+--details::
+Print how named events are resolved internally into perf events, and also
+any extra expressions computed by perf stat.
+
[[EVENT_MODIFIERS]]
EVENT MODIFIERS
@@ -240,6 +244,8 @@ To limit the list use:
. 'pmu' to print the kernel supplied PMU events.
+. 'sdt' to list all Statically Defined Tracepoint events.
+
. If none of the above is matched, it will apply the supplied glob to all
events, printing the ones that match.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index b16003ec14a7..ea3789d05e5e 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -347,6 +347,9 @@ Enable weightened sampling. An additional weight is recorded per sample and can
displayed with the weight and local_weight sort keys. This currently works for TSX
abort events and some memory events in precise mode on modern Intel CPUs.
+--namespaces::
+Record events of type PERF_RECORD_NAMESPACES.
+
--transaction::
Record transaction flags for transaction related events.
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index c04cc0647c16..37a175914157 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -72,7 +72,8 @@ OPTIONS
--sort=::
Sort histogram entries by given key(s) - multiple keys can be specified
in CSV format. Following sort keys are available:
- pid, comm, dso, symbol, parent, cpu, socket, srcline, weight, local_weight.
+ pid, comm, dso, symbol, parent, cpu, socket, srcline, weight,
+ local_weight, cgroup_id.
Each key has following meaning:
@@ -80,6 +81,7 @@ OPTIONS
- pid: command and tid of the task
- dso: name of library or module executed at the time of sample
- symbol: name of function executed at the time of sample
+ - symbol_size: size of function executed at the time of sample
- parent: name of function matched to the parent regex filter. Unmatched
entries are displayed as "[other]".
- cpu: cpu number the task ran at the time of sample
@@ -91,6 +93,7 @@ OPTIONS
- weight: Event specific weight, e.g. memory latency or transaction
abort cost. This is the global weight.
- local_weight: Local weight version of the weight above.
+ - cgroup_id: ID derived from cgroup namespace device and inode numbers.
- transaction: Transaction abort flags.
- overhead: Overhead percentage of sample
- overhead_sys: Overhead percentage of sample running in system mode
@@ -172,6 +175,9 @@ OPTIONS
By default, every sort keys not specified in -F will be appended
automatically.
+ If the keys starts with a prefix '+', then it will append the specified
+ field(s) to the default field order. For example: perf report -F +period,sample.
+
-p::
--parent=<regex>::
A regex filter to identify parent. The parent is a caller of this
@@ -229,6 +235,7 @@ OPTIONS
sort_key can be:
- function: compare on functions (default)
- address: compare on individual code addresses
+ - srcline: compare on source filename and line number
branch can be:
- branch: include last branch information in callgraph when available.
@@ -424,6 +431,10 @@ include::itrace.txt[]
--hierarchy::
Enable hierarchical output.
+--inline::
+ If a callgraph address belongs to an inlined function, the inline stack
+ will be printed. Each entry is function name or file/line.
+
include::callchain-overhead-calculation.txt[]
SEE ALSO
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index d33deddb0146..a092a2499e8f 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -132,6 +132,10 @@ OPTIONS for 'perf sched timehist'
--migrations::
Show migration events.
+-n::
+--next::
+ Show next task.
+
-I::
--idle-hist::
Show idle-related events only.
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 4ed5f239ba7d..cb0eda3925e6 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags, bpf-output,
+ srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn,
callindent, insn, insnlen. Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
@@ -189,15 +189,20 @@ OPTIONS
i.e., -F "" is not allowed.
The brstack output includes branch related information with raw addresses using the
- /v/v/v/v/ syntax in the following order:
+ /v/v/v/v/cycles syntax in the following order:
FROM: branch source instruction
TO : branch target instruction
M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
X/- : X=branch inside a transactional region, -=not in transaction region or not supported
A/- : A=TSX abort entry, -=not aborted region or not supported
+ cycles
The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
+ When brstackinsn is specified the full assembler sequences of branch sequences for each sample
+ is printed. This is the full execution path leading to the sample. This is only supported when the
+ sample was recorded with perf record -b or -j any.
+
-k::
--vmlinux=<file>::
vmlinux pathname
@@ -248,6 +253,9 @@ OPTIONS
--show-mmap-events
Display mmap related events (e.g. MMAP, MMAP2).
+--show-namespace-events
+ Display namespace events i.e. events of type PERF_RECORD_NAMESPACES.
+
--show-switch-events
Display context switch events i.e. events of type PERF_RECORD_SWITCH or
PERF_RECORD_SWITCH_CPU_WIDE.
@@ -299,6 +307,10 @@ include::itrace.txt[]
stop time is not given (i.e, time string is 'x.y,') then analysis goes
to end of file.
+--max-blocks::
+ Set the maximum number of program blocks to print with brstackasm for
+ each sample.
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index aecf2a87e7d6..bd0e4417f2be 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -94,8 +94,7 @@ to activate system-wide monitoring. Default is to count on all CPUs.
-A::
--no-aggr::
-Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
-This option is only valid in system-wide mode.
+Do not aggregate counts across all monitored CPUs.
-n::
--null::
@@ -237,6 +236,9 @@ To interpret the results it is usually needed to know on which
CPUs the workload runs on. If needed the CPUs can be forced using
taskset.
+--no-merge::
+Do not merge results from same PMUs.
+
EXAMPLES
--------
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
index 8672f835ae4e..89018c7311a4 100644
--- a/tools/perf/MANIFEST
+++ b/tools/perf/MANIFEST
@@ -12,6 +12,7 @@ tools/arch/sparc/include/asm/barrier_32.h
tools/arch/sparc/include/asm/barrier_64.h
tools/arch/tile/include/asm/barrier.h
tools/arch/x86/include/asm/barrier.h
+tools/arch/x86/include/asm/cmpxchg.h
tools/arch/x86/include/asm/cpufeatures.h
tools/arch/x86/include/asm/disabled-features.h
tools/arch/x86/include/asm/required-features.h
@@ -72,12 +73,15 @@ tools/include/uapi/asm-generic/mman-common.h
tools/include/uapi/asm-generic/mman.h
tools/include/uapi/linux/bpf.h
tools/include/uapi/linux/bpf_common.h
+tools/include/uapi/linux/fcntl.h
tools/include/uapi/linux/hw_breakpoint.h
tools/include/uapi/linux/mman.h
tools/include/uapi/linux/perf_event.h
+tools/include/uapi/linux/stat.h
tools/include/linux/poison.h
tools/include/linux/rbtree.h
tools/include/linux/rbtree_augmented.h
+tools/include/linux/refcount.h
tools/include/linux/string.h
tools/include/linux/stringify.h
tools/include/linux/types.h
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 27c9fbca7bd9..2b656de99495 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -317,6 +317,10 @@ ifdef NO_DWARF
NO_LIBDW_DWARF_UNWIND := 1
endif
+ifeq ($(feature-sched_getcpu), 1)
+ CFLAGS += -DHAVE_SCHED_GETCPU_SUPPORT
+endif
+
ifndef NO_LIBELF
CFLAGS += -DHAVE_LIBELF_SUPPORT
EXTLIBS += -lelf
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c
index a3c3e1ce6807..4268f7762e25 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -1,5 +1,10 @@
+#include <string.h>
+#include <regex.h>
+
#include "../../perf.h"
+#include "../../util/util.h"
#include "../../util/perf_regs.h"
+#include "../../util/debug.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG(r0, PERF_REG_POWERPC_R0),
@@ -47,3 +52,109 @@ const struct sample_reg sample_reg_masks[] = {
SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
SMPL_REG_END
};
+
+/* REG or %rREG */
+#define SDT_OP_REGEX1 "^(%r)?([1-2]?[0-9]|3[0-1])$"
+
+/* -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) */
+#define SDT_OP_REGEX2 "^(\\-)?([0-9]+)\\((%r)?([1-2]?[0-9]|3[0-1])\\)$"
+
+static regex_t sdt_op_regex1, sdt_op_regex2;
+
+static int sdt_init_op_regex(void)
+{
+ static int initialized;
+ int ret = 0;
+
+ if (initialized)
+ return 0;
+
+ ret = regcomp(&sdt_op_regex1, SDT_OP_REGEX1, REG_EXTENDED);
+ if (ret)
+ goto error;
+
+ ret = regcomp(&sdt_op_regex2, SDT_OP_REGEX2, REG_EXTENDED);
+ if (ret)
+ goto free_regex1;
+
+ initialized = 1;
+ return 0;
+
+free_regex1:
+ regfree(&sdt_op_regex1);
+error:
+ pr_debug4("Regex compilation error.\n");
+ return ret;
+}
+
+/*
+ * Parse OP and convert it into uprobe format, which is, +/-NUM(%gprREG).
+ * Possible variants of OP are:
+ * Format Example
+ * -------------------------
+ * NUM(REG) 48(18)
+ * -NUM(REG) -48(18)
+ * NUM(%rREG) 48(%r18)
+ * -NUM(%rREG) -48(%r18)
+ * REG 18
+ * %rREG %r18
+ * iNUM i0
+ * i-NUM i-1
+ *
+ * SDT marker arguments on Powerpc uses %rREG form with -mregnames flag
+ * and REG form with -mno-regnames. Here REG is general purpose register,
+ * which is in 0 to 31 range.
+ */
+int arch_sdt_arg_parse_op(char *old_op, char **new_op)
+{
+ int ret, new_len;
+ regmatch_t rm[5];
+ char prefix;
+
+ /* Constant argument. Uprobe does not support it */
+ if (old_op[0] == 'i') {
+ pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+ return SDT_ARG_SKIP;
+ }
+
+ ret = sdt_init_op_regex();
+ if (ret < 0)
+ return ret;
+
+ if (!regexec(&sdt_op_regex1, old_op, 3, rm, 0)) {
+ /* REG or %rREG --> %gprREG */
+
+ new_len = 5; /* % g p r NULL */
+ new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+
+ *new_op = zalloc(new_len);
+ if (!*new_op)
+ return -ENOMEM;
+
+ scnprintf(*new_op, new_len, "%%gpr%.*s",
+ (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so);
+ } else if (!regexec(&sdt_op_regex2, old_op, 5, rm, 0)) {
+ /*
+ * -NUM(REG) or NUM(REG) or -NUM(%rREG) or NUM(%rREG) -->
+ * +/-NUM(%gprREG)
+ */
+ prefix = (rm[1].rm_so == -1) ? '+' : '-';
+
+ new_len = 8; /* +/- ( % g p r ) NULL */
+ new_len += (int)(rm[2].rm_eo - rm[2].rm_so);
+ new_len += (int)(rm[4].rm_eo - rm[4].rm_so);
+
+ *new_op = zalloc(new_len);
+ if (!*new_op)
+ return -ENOMEM;
+
+ scnprintf(*new_op, new_len, "%c%.*s(%%gpr%.*s)", prefix,
+ (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+ (int)(rm[4].rm_eo - rm[4].rm_so), old_op + rm[4].rm_so);
+ } else {
+ pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+ return SDT_ARG_SKIP;
+ }
+
+ return SDT_ARG_VALID;
+}
diff --git a/tools/perf/arch/powerpc/util/sym-handling.c b/tools/perf/arch/powerpc/util/sym-handling.c
index 1030a6e504bb..39dbe512b9fc 100644
--- a/tools/perf/arch/powerpc/util/sym-handling.c
+++ b/tools/perf/arch/powerpc/util/sym-handling.c
@@ -10,6 +10,7 @@
#include "symbol.h"
#include "map.h"
#include "probe-event.h"
+#include "probe-file.h"
#ifdef HAVE_LIBELF_SUPPORT
bool elf__needs_adjust_symbols(GElf_Ehdr ehdr)
@@ -79,13 +80,18 @@ void arch__fix_tev_from_maps(struct perf_probe_event *pev,
* However, if the user specifies an offset, we fall back to using the
* GEP since all userspace applications (objdump/readelf) show function
* disassembly with offsets from the GEP.
- *
- * In addition, we shouldn't specify an offset for kretprobes.
*/
- if (pev->point.offset || (!pev->uprobes && pev->point.retprobe) ||
- !map || !sym)
+ if (pev->point.offset || !map || !sym)
return;
+ /* For kretprobes, add an offset only if the kernel supports it */
+ if (!pev->uprobes && pev->point.retprobe) {
+#ifdef HAVE_LIBELF_SUPPORT
+ if (!kretprobe_offset_is_supported())
+#endif
+ return;
+ }
+
lep_offset = PPC64_LOCAL_ENTRY_OFFSET(sym->arch_sym);
if (map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS)
diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
index e93ef0b38db8..5aef183e2f85 100644
--- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl
@@ -338,6 +338,7 @@
329 common pkey_mprotect sys_pkey_mprotect
330 common pkey_alloc sys_pkey_alloc
331 common pkey_free sys_pkey_free
+332 common statx sys_statx
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/tools/perf/arch/x86/util/perf_regs.c b/tools/perf/arch/x86/util/perf_regs.c
index c5db14f36cc7..3bf3548c5e2d 100644
--- a/tools/perf/arch/x86/util/perf_regs.c
+++ b/tools/perf/arch/x86/util/perf_regs.c
@@ -1,5 +1,10 @@
+#include <string.h>
+#include <regex.h>
+
#include "../../perf.h"
+#include "../../util/util.h"
#include "../../util/perf_regs.h"
+#include "../../util/debug.h"
const struct sample_reg sample_reg_masks[] = {
SMPL_REG(AX, PERF_REG_X86_AX),
@@ -26,3 +31,224 @@ const struct sample_reg sample_reg_masks[] = {
#endif
SMPL_REG_END
};
+
+struct sdt_name_reg {
+ const char *sdt_name;
+ const char *uprobe_name;
+};
+#define SDT_NAME_REG(n, m) {.sdt_name = "%" #n, .uprobe_name = "%" #m}
+#define SDT_NAME_REG_END {.sdt_name = NULL, .uprobe_name = NULL}
+
+static const struct sdt_name_reg sdt_reg_tbl[] = {
+ SDT_NAME_REG(eax, ax),
+ SDT_NAME_REG(rax, ax),
+ SDT_NAME_REG(al, ax),
+ SDT_NAME_REG(ah, ax),
+ SDT_NAME_REG(ebx, bx),
+ SDT_NAME_REG(rbx, bx),
+ SDT_NAME_REG(bl, bx),
+ SDT_NAME_REG(bh, bx),
+ SDT_NAME_REG(ecx, cx),
+ SDT_NAME_REG(rcx, cx),
+ SDT_NAME_REG(cl, cx),
+ SDT_NAME_REG(ch, cx),
+ SDT_NAME_REG(edx, dx),
+ SDT_NAME_REG(rdx, dx),
+ SDT_NAME_REG(dl, dx),
+ SDT_NAME_REG(dh, dx),
+ SDT_NAME_REG(esi, si),
+ SDT_NAME_REG(rsi, si),
+ SDT_NAME_REG(sil, si),
+ SDT_NAME_REG(edi, di),
+ SDT_NAME_REG(rdi, di),
+ SDT_NAME_REG(dil, di),
+ SDT_NAME_REG(ebp, bp),
+ SDT_NAME_REG(rbp, bp),
+ SDT_NAME_REG(bpl, bp),
+ SDT_NAME_REG(rsp, sp),
+ SDT_NAME_REG(esp, sp),
+ SDT_NAME_REG(spl, sp),
+
+ /* rNN registers */
+ SDT_NAME_REG(r8b, r8),
+ SDT_NAME_REG(r8w, r8),
+ SDT_NAME_REG(r8d, r8),
+ SDT_NAME_REG(r9b, r9),
+ SDT_NAME_REG(r9w, r9),
+ SDT_NAME_REG(r9d, r9),
+ SDT_NAME_REG(r10b, r10),
+ SDT_NAME_REG(r10w, r10),
+ SDT_NAME_REG(r10d, r10),
+ SDT_NAME_REG(r11b, r11),
+ SDT_NAME_REG(r11w, r11),
+ SDT_NAME_REG(r11d, r11),
+ SDT_NAME_REG(r12b, r12),
+ SDT_NAME_REG(r12w, r12),
+ SDT_NAME_REG(r12d, r12),
+ SDT_NAME_REG(r13b, r13),
+ SDT_NAME_REG(r13w, r13),
+ SDT_NAME_REG(r13d, r13),
+ SDT_NAME_REG(r14b, r14),
+ SDT_NAME_REG(r14w, r14),
+ SDT_NAME_REG(r14d, r14),
+ SDT_NAME_REG(r15b, r15),
+ SDT_NAME_REG(r15w, r15),
+ SDT_NAME_REG(r15d, r15),
+ SDT_NAME_REG_END,
+};
+
+/*
+ * Perf only supports OP which is in +/-NUM(REG) form.
+ * Here plus-minus sign, NUM and parenthesis are optional,
+ * only REG is mandatory.
+ *
+ * SDT events also supports indirect addressing mode with a
+ * symbol as offset, scaled mode and constants in OP. But
+ * perf does not support them yet. Below are few examples.
+ *
+ * OP with scaled mode:
+ * (%rax,%rsi,8)
+ * 10(%ras,%rsi,8)
+ *
+ * OP with indirect addressing mode:
+ * check_action(%rip)
+ * mp_+52(%rip)
+ * 44+mp_(%rip)
+ *
+ * OP with constant values:
+ * $0
+ * $123
+ * $-1
+ */
+#define SDT_OP_REGEX "^([+\\-]?)([0-9]*)(\\(?)(%[a-z][a-z0-9]+)(\\)?)$"
+
+static regex_t sdt_op_regex;
+
+static int sdt_init_op_regex(void)
+{
+ static int initialized;
+ int ret = 0;
+
+ if (initialized)
+ return 0;
+
+ ret = regcomp(&sdt_op_regex, SDT_OP_REGEX, REG_EXTENDED);
+ if (ret < 0) {
+ pr_debug4("Regex compilation error.\n");
+ return ret;
+ }
+
+ initialized = 1;
+ return 0;
+}
+
+/*
+ * Max x86 register name length is 5(ex: %r15d). So, 6th char
+ * should always contain NULL. This helps to find register name
+ * length using strlen, insted of maintaing one more variable.
+ */
+#define SDT_REG_NAME_SIZE 6
+
+/*
+ * The uprobe parser does not support all gas register names;
+ * so, we have to replace them (ex. for x86_64: %rax -> %ax).
+ * Note: If register does not require renaming, just copy
+ * paste as it is, but don't leave it empty.
+ */
+static void sdt_rename_register(char *sdt_reg, int sdt_len, char *uprobe_reg)
+{
+ int i = 0;
+
+ for (i = 0; sdt_reg_tbl[i].sdt_name != NULL; i++) {
+ if (!strncmp(sdt_reg_tbl[i].sdt_name, sdt_reg, sdt_len)) {
+ strcpy(uprobe_reg, sdt_reg_tbl[i].uprobe_name);
+ return;
+ }
+ }
+
+ strncpy(uprobe_reg, sdt_reg, sdt_len);
+}
+
+int arch_sdt_arg_parse_op(char *old_op, char **new_op)
+{
+ char new_reg[SDT_REG_NAME_SIZE] = {0};
+ int new_len = 0, ret;
+ /*
+ * rm[0]: +/-NUM(REG)
+ * rm[1]: +/-
+ * rm[2]: NUM
+ * rm[3]: (
+ * rm[4]: REG
+ * rm[5]: )
+ */
+ regmatch_t rm[6];
+ /*
+ * Max prefix length is 2 as it may contains sign(+/-)
+ * and displacement 0 (Both sign and displacement 0 are
+ * optional so it may be empty). Use one more character
+ * to hold last NULL so that strlen can be used to find
+ * prefix length, instead of maintaing one more variable.
+ */
+ char prefix[3] = {0};
+
+ ret = sdt_init_op_regex();
+ if (ret < 0)
+ return ret;
+
+ /*
+ * If unsupported OR does not match with regex OR
+ * register name too long, skip it.
+ */
+ if (strchr(old_op, ',') || strchr(old_op, '$') ||
+ regexec(&sdt_op_regex, old_op, 6, rm, 0) ||
+ rm[4].rm_eo - rm[4].rm_so > SDT_REG_NAME_SIZE) {
+ pr_debug4("Skipping unsupported SDT argument: %s\n", old_op);
+ return SDT_ARG_SKIP;
+ }
+
+ /*
+ * Prepare prefix.
+ * If SDT OP has parenthesis but does not provide
+ * displacement, add 0 for displacement.
+ * SDT Uprobe Prefix
+ * -----------------------------
+ * +24(%rdi) +24(%di) +
+ * 24(%rdi) +24(%di) +
+ * %rdi %di
+ * (%rdi) +0(%di) +0
+ * -80(%rbx) -80(%bx) -
+ */
+ if (rm[3].rm_so != rm[3].rm_eo) {
+ if (rm[1].rm_so != rm[1].rm_eo)
+ prefix[0] = *(old_op + rm[1].rm_so);
+ else if (rm[2].rm_so != rm[2].rm_eo)
+ prefix[0] = '+';
+ else
+ strncpy(prefix, "+0", 2);
+ }
+
+ /* Rename register */
+ sdt_rename_register(old_op + rm[4].rm_so, rm[4].rm_eo - rm[4].rm_so,
+ new_reg);
+
+ /* Prepare final OP which should be valid for uprobe_events */
+ new_len = strlen(prefix) +
+ (rm[2].rm_eo - rm[2].rm_so) +
+ (rm[3].rm_eo - rm[3].rm_so) +
+ strlen(new_reg) +
+ (rm[5].rm_eo - rm[5].rm_so) +
+ 1; /* NULL */
+
+ *new_op = zalloc(new_len);
+ if (!*new_op)
+ return -ENOMEM;
+
+ scnprintf(*new_op, new_len, "%.*s%.*s%.*s%.*s%.*s",
+ strlen(prefix), prefix,
+ (int)(rm[2].rm_eo - rm[2].rm_so), old_op + rm[2].rm_so,
+ (int)(rm[3].rm_eo - rm[3].rm_so), old_op + rm[3].rm_so,
+ strlen(new_reg), new_reg,
+ (int)(rm[5].rm_eo - rm[5].rm_so), old_op + rm[5].rm_so);
+
+ return SDT_ARG_VALID;
+}
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 579a592990dd..842ab2781cdc 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -25,17 +25,17 @@
# endif
#endif
-int bench_numa(int argc, const char **argv, const char *prefix);
-int bench_sched_messaging(int argc, const char **argv, const char *prefix);
-int bench_sched_pipe(int argc, const char **argv, const char *prefix);
-int bench_mem_memcpy(int argc, const char **argv, const char *prefix);
-int bench_mem_memset(int argc, const char **argv, const char *prefix);
-int bench_futex_hash(int argc, const char **argv, const char *prefix);
-int bench_futex_wake(int argc, const char **argv, const char *prefix);
-int bench_futex_wake_parallel(int argc, const char **argv, const char *prefix);
-int bench_futex_requeue(int argc, const char **argv, const char *prefix);
+int bench_numa(int argc, const char **argv);
+int bench_sched_messaging(int argc, const char **argv);
+int bench_sched_pipe(int argc, const char **argv);
+int bench_mem_memcpy(int argc, const char **argv);
+int bench_mem_memset(int argc, const char **argv);
+int bench_futex_hash(int argc, const char **argv);
+int bench_futex_wake(int argc, const char **argv);
+int bench_futex_wake_parallel(int argc, const char **argv);
+int bench_futex_requeue(int argc, const char **argv);
/* pi futexes */
-int bench_futex_lock_pi(int argc, const char **argv, const char *prefix);
+int bench_futex_lock_pi(int argc, const char **argv);
#define BENCH_FORMAT_DEFAULT_STR "default"
#define BENCH_FORMAT_DEFAULT 0
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index da04b8c5568a..fe16b310097f 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -9,6 +9,7 @@
*/
/* For the CLR_() macros */
+#include <string.h>
#include <pthread.h>
#include <errno.h>
@@ -113,8 +114,7 @@ static void print_summary(void)
(int) runtime.tv_sec);
}
-int bench_futex_hash(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int bench_futex_hash(int argc, const char **argv)
{
int ret = 0;
cpu_set_t cpu;
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 91877777ec6e..73a1c44ea63c 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -3,6 +3,7 @@
*/
/* For the CLR_() macros */
+#include <string.h>
#include <pthread.h>
#include <signal.h>
@@ -139,8 +140,7 @@ static void create_threads(struct worker *w, pthread_attr_t thread_attr)
}
}
-int bench_futex_lock_pi(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int bench_futex_lock_pi(int argc, const char **argv)
{
int ret = 0;
unsigned int i;
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index 2b9705a8734c..41786cbea24c 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -9,6 +9,7 @@
*/
/* For the CLR_() macros */
+#include <string.h>
#include <pthread.h>
#include <signal.h>
@@ -108,8 +109,7 @@ static void toggle_done(int sig __maybe_unused,
done = true;
}
-int bench_futex_requeue(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int bench_futex_requeue(int argc, const char **argv)
{
int ret = 0;
unsigned int i, j;
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index 2c8fa67ad537..4ab12c8e016a 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -8,6 +8,7 @@
*/
/* For the CLR_() macros */
+#include <string.h>
#include <pthread.h>
#include <signal.h>
@@ -196,8 +197,7 @@ static void toggle_done(int sig __maybe_unused,
done = true;
}
-int bench_futex_wake_parallel(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int bench_futex_wake_parallel(int argc, const char **argv)
{
int ret = 0;
unsigned int i, j;
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index e246b1b8388a..2fa49222ef8d 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -9,6 +9,7 @@
*/
/* For the CLR_() macros */
+#include <string.h>
#include <pthread.h>
#include <signal.h>
@@ -114,8 +115,7 @@ static void toggle_done(int sig __maybe_unused,
done = true;
}
-int bench_futex_wake(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int bench_futex_wake(int argc, const char **argv)
{
int ret = 0;
unsigned int i, j;
diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h
index b2e06d1190d0..e44fd3239530 100644
--- a/tools/perf/bench/futex.h
+++ b/tools/perf/bench/futex.h
@@ -88,13 +88,11 @@ futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wak
#ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
#include <pthread.h>
-static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr,
- size_t cpusetsize,
- cpu_set_t *cpuset)
+#include <linux/compiler.h>
+static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr __maybe_unused,
+ size_t cpusetsize __maybe_unused,
+ cpu_set_t *cpuset __maybe_unused)
{
- attr = attr;
- cpusetsize = cpusetsize;
- cpuset = cpuset;
return 0;
}
#endif
diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
index 52504a83b5a1..d1dea33dcfcf 100644
--- a/tools/perf/bench/mem-functions.c
+++ b/tools/perf/bench/mem-functions.c
@@ -284,7 +284,7 @@ static const char * const bench_mem_memcpy_usage[] = {
NULL
};
-int bench_mem_memcpy(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_mem_memcpy(int argc, const char **argv)
{
struct bench_mem_info info = {
.functions = memcpy_functions,
@@ -358,7 +358,7 @@ static const struct function memset_functions[] = {
{ .name = NULL, }
};
-int bench_mem_memset(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_mem_memset(int argc, const char **argv)
{
struct bench_mem_info info = {
.functions = memset_functions,
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 3083fc36282b..1fe43bd5a012 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -187,7 +187,8 @@ static const struct option options[] = {
OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"),
OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"),
OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
- OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
+ OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, "
+ "convergence is reached when each process (all its threads) is running on a single NUMA node."),
OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
@@ -1766,7 +1767,7 @@ static int bench_all(void)
return 0;
}
-int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_numa(int argc, const char **argv)
{
init_params(&p0, "main,", argc, argv);
argc = parse_options(argc, argv, options, bench_numa_usage, 0);
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index 6a111e775210..4f961e74535b 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -260,8 +260,7 @@ static const char * const bench_sched_message_usage[] = {
NULL
};
-int bench_sched_messaging(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int bench_sched_messaging(int argc, const char **argv)
{
unsigned int i, total_children;
struct timeval start, stop, diff;
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index 2243f0150d76..a152737370c5 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -76,7 +76,7 @@ static void *worker_thread(void *__tdata)
return NULL;
}
-int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_sched_pipe(int argc, const char **argv)
{
struct thread_data threads[2], *td;
int pipe_1[2], pipe_2[2];
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 4f52d85f5ebc..56a7c8d210b9 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -383,7 +383,7 @@ static const char * const annotate_usage[] = {
NULL
};
-int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_annotate(int argc, const char **argv)
{
struct perf_annotate annotate = {
.tool = {
@@ -393,6 +393,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
.comm = perf_event__process_comm,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
.ordering_requires_timestamps = true,
},
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index a1cddc6bbf0f..445e62881254 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -25,7 +25,7 @@
#include <string.h>
#include <sys/prctl.h>
-typedef int (*bench_fn_t)(int argc, const char **argv, const char *prefix);
+typedef int (*bench_fn_t)(int argc, const char **argv);
struct bench {
const char *name;
@@ -155,7 +155,7 @@ static int bench_str2int(const char *str)
* to something meaningful:
*/
static int run_bench(const char *coll_name, const char *bench_name, bench_fn_t fn,
- int argc, const char **argv, const char *prefix)
+ int argc, const char **argv)
{
int size;
char *name;
@@ -171,7 +171,7 @@ static int run_bench(const char *coll_name, const char *bench_name, bench_fn_t f
prctl(PR_SET_NAME, name);
argv[0] = name;
- ret = fn(argc, argv, prefix);
+ ret = fn(argc, argv);
free(name);
@@ -198,7 +198,7 @@ static void run_collection(struct collection *coll)
fflush(stdout);
argv[1] = bench->name;
- run_bench(coll->name, bench->name, bench->fn, 1, argv, NULL);
+ run_bench(coll->name, bench->name, bench->fn, 1, argv);
printf("\n");
}
}
@@ -211,7 +211,7 @@ static void run_all_collections(void)
run_collection(coll);
}
-int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_bench(int argc, const char **argv)
{
struct collection *coll;
int ret = 0;
@@ -270,7 +270,7 @@ int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused)
if (bench_format == BENCH_FORMAT_DEFAULT)
printf("# Running '%s/%s' benchmark:\n", coll->name, bench->name);
fflush(stdout);
- ret = run_bench(coll->name, bench->name, bench->fn, argc-1, argv+1, prefix);
+ ret = run_bench(coll->name, bench->name, bench->fn, argc-1, argv+1);
goto end;
}
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 30e2b2cb2421..94b55eee0d9b 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -276,8 +276,7 @@ static int build_id_cache__update_file(const char *filename)
return err;
}
-int cmd_buildid_cache(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int cmd_buildid_cache(int argc, const char **argv)
{
struct strlist *list;
struct str_node *pos;
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index 5e914ee79eb3..26f4e608207f 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -87,8 +87,7 @@ out:
return 0;
}
-int cmd_buildid_list(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int cmd_buildid_list(int argc, const char **argv)
{
bool show_kernel = false;
bool with_hits = false;
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index e2b21723bbf8..70c2c773a2b8 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -2334,7 +2334,7 @@ out:
static void perf_c2c_display(struct perf_session *session)
{
- if (c2c.use_stdio)
+ if (use_browser == 0)
perf_c2c__hists_fprintf(stdout, session);
else
perf_c2c__hists_browse(&c2c.hists.hists);
@@ -2536,7 +2536,7 @@ static int perf_c2c__report(int argc, const char **argv)
OPT_BOOLEAN(0, "stdio", &c2c.use_stdio, "Use the stdio interface"),
#endif
OPT_BOOLEAN(0, "stats", &c2c.stats_only,
- "Use the stdio interface"),
+ "Display only statistic tables (implies --stdio)"),
OPT_BOOLEAN(0, "full-symbols", &c2c.symbol_full,
"Display full length of symbols"),
OPT_BOOLEAN(0, "no-source", &no_source,
@@ -2755,12 +2755,12 @@ static int perf_c2c__record(int argc, const char **argv)
pr_debug("\n");
}
- ret = cmd_record(i, rec_argv, NULL);
+ ret = cmd_record(i, rec_argv);
free(rec_argv);
return ret;
}
-int cmd_c2c(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_c2c(int argc, const char **argv)
{
argc = parse_options(argc, argv, c2c_options, c2c_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c
index 8c0d93b7c2f0..55f04f85b049 100644
--- a/tools/perf/builtin-config.c
+++ b/tools/perf/builtin-config.c
@@ -154,7 +154,7 @@ static int parse_config_arg(char *arg, char **var, char **value)
return 0;
}
-int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_config(int argc, const char **argv)
{
int i, ret = 0;
struct perf_config_set *set;
diff --git a/tools/perf/builtin-data.c b/tools/perf/builtin-data.c
index 7ad6e17ac6b3..0adb5f82335a 100644
--- a/tools/perf/builtin-data.c
+++ b/tools/perf/builtin-data.c
@@ -6,7 +6,7 @@
#include "data-convert.h"
#include "data-convert-bt.h"
-typedef int (*data_cmd_fn_t)(int argc, const char **argv, const char *prefix);
+typedef int (*data_cmd_fn_t)(int argc, const char **argv);
struct data_cmd {
const char *name;
@@ -50,8 +50,7 @@ static const char * const data_convert_usage[] = {
NULL
};
-static int cmd_data_convert(int argc, const char **argv,
- const char *prefix __maybe_unused)
+static int cmd_data_convert(int argc, const char **argv)
{
const char *to_ctf = NULL;
struct perf_data_convert_opts opts = {
@@ -98,7 +97,7 @@ static struct data_cmd data_cmds[] = {
{ .name = NULL, },
};
-int cmd_data(int argc, const char **argv, const char *prefix)
+int cmd_data(int argc, const char **argv)
{
struct data_cmd *cmd;
const char *cmdstr;
@@ -118,7 +117,7 @@ int cmd_data(int argc, const char **argv, const char *prefix)
if (strcmp(cmd->name, cmdstr))
continue;
- return cmd->fn(argc, argv, prefix);
+ return cmd->fn(argc, argv);
}
pr_err("Unknown command: %s\n", cmdstr);
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 1b96a3122228..cd2605d86984 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -364,6 +364,7 @@ static struct perf_tool tool = {
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.lost = perf_event__process_lost,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
.ordering_requires_timestamps = true,
};
@@ -1320,7 +1321,7 @@ static int diff__config(const char *var, const char *value,
return 0;
}
-int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_diff(int argc, const char **argv)
{
int ret = hists__init();
diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c
index e09c4287fe87..6d210e40d611 100644
--- a/tools/perf/builtin-evlist.c
+++ b/tools/perf/builtin-evlist.c
@@ -46,7 +46,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details
return 0;
}
-int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_evlist(int argc, const char **argv)
{
struct perf_attr_details details = { .verbose = false, };
const struct option options[] = {
diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c
index c3e643666c72..f80fb60b00b0 100644
--- a/tools/perf/builtin-ftrace.c
+++ b/tools/perf/builtin-ftrace.c
@@ -11,11 +11,13 @@
#include <unistd.h>
#include <signal.h>
+#include <fcntl.h>
#include "debug.h"
#include <subcmd/parse-options.h>
#include "evlist.h"
#include "target.h"
+#include "cpumap.h"
#include "thread_map.h"
#include "util/config.h"
@@ -50,11 +52,12 @@ static void ftrace__workload_exec_failed_signal(int signo __maybe_unused,
done = true;
}
-static int write_tracing_file(const char *name, const char *val)
+static int __write_tracing_file(const char *name, const char *val, bool append)
{
char *file;
int fd, ret = -1;
ssize_t size = strlen(val);
+ int flags = O_WRONLY;
file = get_tracing_file(name);
if (!file) {
@@ -62,7 +65,12 @@ static int write_tracing_file(const char *name, const char *val)
return -1;
}
- fd = open(file, O_WRONLY);
+ if (append)
+ flags |= O_APPEND;
+ else
+ flags |= O_TRUNC;
+
+ fd = open(file, flags);
if (fd < 0) {
pr_debug("cannot open tracing file: %s\n", name);
goto out;
@@ -79,6 +87,18 @@ out:
return ret;
}
+static int write_tracing_file(const char *name, const char *val)
+{
+ return __write_tracing_file(name, val, false);
+}
+
+static int append_tracing_file(const char *name, const char *val)
+{
+ return __write_tracing_file(name, val, true);
+}
+
+static int reset_tracing_cpu(void);
+
static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
{
if (write_tracing_file("tracing_on", "0") < 0)
@@ -90,14 +110,78 @@ static int reset_tracing_files(struct perf_ftrace *ftrace __maybe_unused)
if (write_tracing_file("set_ftrace_pid", " ") < 0)
return -1;
+ if (reset_tracing_cpu() < 0)
+ return -1;
+
+ return 0;
+}
+
+static int set_tracing_pid(struct perf_ftrace *ftrace)
+{
+ int i;
+ char buf[16];
+
+ if (target__has_cpu(&ftrace->target))
+ return 0;
+
+ for (i = 0; i < thread_map__nr(ftrace->evlist->threads); i++) {
+ scnprintf(buf, sizeof(buf), "%d",
+ ftrace->evlist->threads->map[i]);
+ if (append_tracing_file("set_ftrace_pid", buf) < 0)
+ return -1;
+ }
return 0;
}
+static int set_tracing_cpumask(struct cpu_map *cpumap)
+{
+ char *cpumask;
+ size_t mask_size;
+ int ret;
+ int last_cpu;
+
+ last_cpu = cpu_map__cpu(cpumap, cpumap->nr - 1);
+ mask_size = (last_cpu + 3) / 4 + 1;
+ mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */
+
+ cpumask = malloc(mask_size);
+ if (cpumask == NULL) {
+ pr_debug("failed to allocate cpu mask\n");
+ return -1;
+ }
+
+ cpu_map__snprint_mask(cpumap, cpumask, mask_size);
+
+ ret = write_tracing_file("tracing_cpumask", cpumask);
+
+ free(cpumask);
+ return ret;
+}
+
+static int set_tracing_cpu(struct perf_ftrace *ftrace)
+{
+ struct cpu_map *cpumap = ftrace->evlist->cpus;
+
+ if (!target__has_cpu(&ftrace->target))
+ return 0;
+
+ return set_tracing_cpumask(cpumap);
+}
+
+static int reset_tracing_cpu(void)
+{
+ struct cpu_map *cpumap = cpu_map__new(NULL);
+ int ret;
+
+ ret = set_tracing_cpumask(cpumap);
+ cpu_map__put(cpumap);
+ return ret;
+}
+
static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
{
char *trace_file;
int trace_fd;
- char *trace_pid;
char buf[4096];
struct pollfd pollfd = {
.events = POLLIN,
@@ -108,42 +192,43 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
return -1;
}
- if (argc < 1)
- return -1;
-
signal(SIGINT, sig_handler);
signal(SIGUSR1, sig_handler);
signal(SIGCHLD, sig_handler);
+ signal(SIGPIPE, sig_handler);
- reset_tracing_files(ftrace);
+ if (reset_tracing_files(ftrace) < 0)
+ goto out;
/* reset ftrace buffer */
if (write_tracing_file("trace", "0") < 0)
goto out;
- if (perf_evlist__prepare_workload(ftrace->evlist, &ftrace->target,
- argv, false, ftrace__workload_exec_failed_signal) < 0)
+ if (argc && perf_evlist__prepare_workload(ftrace->evlist,
+ &ftrace->target, argv, false,
+ ftrace__workload_exec_failed_signal) < 0) {
goto out;
+ }
- if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
- pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
- goto out;
+ if (set_tracing_pid(ftrace) < 0) {
+ pr_err("failed to set ftrace pid\n");
+ goto out_reset;
}
- if (asprintf(&trace_pid, "%d", thread_map__pid(ftrace->evlist->threads, 0)) < 0) {
- pr_err("failed to allocate pid string\n");
- goto out;
+ if (set_tracing_cpu(ftrace) < 0) {
+ pr_err("failed to set tracing cpumask\n");
+ goto out_reset;
}
- if (write_tracing_file("set_ftrace_pid", trace_pid) < 0) {
- pr_err("failed to set pid: %s\n", trace_pid);
- goto out_free_pid;
+ if (write_tracing_file("current_tracer", ftrace->tracer) < 0) {
+ pr_err("failed to set current_tracer to %s\n", ftrace->tracer);
+ goto out_reset;
}
trace_file = get_tracing_file("trace_pipe");
if (!trace_file) {
pr_err("failed to open trace_pipe\n");
- goto out_free_pid;
+ goto out_reset;
}
trace_fd = open(trace_file, O_RDONLY);
@@ -152,7 +237,7 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
if (trace_fd < 0) {
pr_err("failed to open trace_pipe\n");
- goto out_free_pid;
+ goto out_reset;
}
fcntl(trace_fd, F_SETFL, O_NONBLOCK);
@@ -163,6 +248,8 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
goto out_close_fd;
}
+ setup_pager();
+
perf_evlist__start_workload(ftrace->evlist);
while (!done) {
@@ -191,11 +278,9 @@ static int __cmd_ftrace(struct perf_ftrace *ftrace, int argc, const char **argv)
out_close_fd:
close(trace_fd);
-out_free_pid:
- free(trace_pid);
-out:
+out_reset:
reset_tracing_files(ftrace);
-
+out:
return done ? 0 : -1;
}
@@ -219,7 +304,7 @@ static int perf_ftrace_config(const char *var, const char *value, void *cb)
return -1;
}
-int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_ftrace(int argc, const char **argv)
{
int ret;
struct perf_ftrace ftrace = {
@@ -227,15 +312,21 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
.target = { .uid = UINT_MAX, },
};
const char * const ftrace_usage[] = {
- "perf ftrace [<options>] <command>",
+ "perf ftrace [<options>] [<command>]",
"perf ftrace [<options>] -- <command> [<options>]",
NULL
};
const struct option ftrace_options[] = {
OPT_STRING('t', "tracer", &ftrace.tracer, "tracer",
"tracer to use: function_graph(default) or function"),
+ OPT_STRING('p', "pid", &ftrace.target.pid, "pid",
+ "trace on existing process id"),
OPT_INCR('v', "verbose", &verbose,
"be more verbose"),
+ OPT_BOOLEAN('a', "all-cpus", &ftrace.target.system_wide,
+ "system-wide collection from all CPUs"),
+ OPT_STRING('C', "cpu", &ftrace.target.cpu_list, "cpu",
+ "list of cpus to monitor"),
OPT_END()
};
@@ -245,9 +336,18 @@ int cmd_ftrace(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options(argc, argv, ftrace_options, ftrace_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
- if (!argc)
+ if (!argc && target__none(&ftrace.target))
usage_with_options(ftrace_usage, ftrace_options);
+ ret = target__validate(&ftrace.target);
+ if (ret) {
+ char errbuf[512];
+
+ target__strerror(&ftrace.target, ret, errbuf, 512);
+ pr_err("%s\n", errbuf);
+ return -EINVAL;
+ }
+
ftrace.evlist = perf_evlist__new();
if (ftrace.evlist == NULL)
return -ENOMEM;
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index aed0d844e8c2..1eec96a0fa67 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -301,12 +301,6 @@ void list_common_cmds_help(void)
}
}
-static int is_perf_command(const char *s)
-{
- return is_in_cmdlist(&main_cmds, s) ||
- is_in_cmdlist(&other_cmds, s);
-}
-
static const char *cmd_to_page(const char *perf_cmd)
{
char *s;
@@ -418,7 +412,7 @@ static int show_html_page(const char *perf_cmd)
return 0;
}
-int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_help(int argc, const char **argv)
{
bool show_all = false;
enum help_format help_format = HELP_FORMAT_MAN;
@@ -446,7 +440,6 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
"perf help [--all] [--man|--web|--info] [command]",
NULL
};
- const char *alias;
int rc;
load_command_list("perf-", &main_cmds, &other_cmds);
@@ -472,12 +465,6 @@ int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
return 0;
}
- alias = alias_lookup(argv[0]);
- if (alias && !is_perf_command(argv[0])) {
- printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
- return 0;
- }
-
switch (help_format) {
case HELP_FORMAT_MAN:
rc = show_man_page(argv[0]);
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index b9bc7e39833a..42dff0b1375a 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -333,6 +333,18 @@ static int perf_event__repipe_comm(struct perf_tool *tool,
return err;
}
+static int perf_event__repipe_namespaces(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ int err = perf_event__process_namespaces(tool, event, sample, machine);
+
+ perf_event__repipe(tool, event, sample, machine);
+
+ return err;
+}
+
static int perf_event__repipe_exit(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -660,6 +672,7 @@ static int __cmd_inject(struct perf_inject *inject)
session->itrace_synth_opts = &inject->itrace_synth_opts;
inject->itrace_synth_opts.inject = true;
inject->tool.comm = perf_event__repipe_comm;
+ inject->tool.namespaces = perf_event__repipe_namespaces;
inject->tool.exit = perf_event__repipe_exit;
inject->tool.id_index = perf_event__repipe_id_index;
inject->tool.auxtrace_info = perf_event__process_auxtrace_info;
@@ -725,7 +738,7 @@ static int __cmd_inject(struct perf_inject *inject)
return ret;
}
-int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_inject(int argc, const char **argv)
{
struct perf_inject inject = {
.tool = {
diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c
index 224bfc454b4a..8ff38c4eb2c0 100644
--- a/tools/perf/builtin-kallsyms.c
+++ b/tools/perf/builtin-kallsyms.c
@@ -43,7 +43,7 @@ static int __cmd_kallsyms(int argc, const char **argv)
return 0;
}
-int cmd_kallsyms(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_kallsyms(int argc, const char **argv)
{
const struct option options[] = {
OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"),
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 6da8d083e4e5..515587825af4 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -964,6 +964,7 @@ static struct perf_tool perf_kmem = {
.comm = perf_event__process_comm,
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
};
@@ -1865,7 +1866,7 @@ static int __cmd_record(int argc, const char **argv)
for (j = 1; j < (unsigned int)argc; j++, i++)
rec_argv[i] = argv[j];
- return cmd_record(i, rec_argv, NULL);
+ return cmd_record(i, rec_argv);
}
static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
@@ -1884,7 +1885,7 @@ static int kmem_config(const char *var, const char *value, void *cb __maybe_unus
return 0;
}
-int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_kmem(int argc, const char **argv)
{
const char * const default_slab_sort = "frag,hit,bytes";
const char * const default_page_sort = "bytes,hit";
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index 08fa88f62a24..38b409173693 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1044,6 +1044,7 @@ static int read_events(struct perf_kvm_stat *kvm)
struct perf_tool eops = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
};
struct perf_data_file file = {
@@ -1208,7 +1209,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
set_option_flag(record_options, 0, "transaction", PARSE_OPT_DISABLED);
record_usage = kvm_stat_record_usage;
- return cmd_record(i, rec_argv, NULL);
+ return cmd_record(i, rec_argv);
}
static int
@@ -1348,6 +1349,7 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
kvm->tool.exit = perf_event__process_exit;
kvm->tool.fork = perf_event__process_fork;
kvm->tool.lost = process_lost_event;
+ kvm->tool.namespaces = perf_event__process_namespaces;
kvm->tool.ordered_events = true;
perf_tool__fill_defaults(&kvm->tool);
@@ -1475,7 +1477,7 @@ static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
#endif
perf_stat:
- return cmd_stat(argc, argv, NULL);
+ return cmd_stat(argc, argv);
}
#endif /* HAVE_KVM_STAT_SUPPORT */
@@ -1494,7 +1496,7 @@ static int __cmd_record(const char *file_name, int argc, const char **argv)
BUG_ON(i != rec_argc);
- return cmd_record(i, rec_argv, NULL);
+ return cmd_record(i, rec_argv);
}
static int __cmd_report(const char *file_name, int argc, const char **argv)
@@ -1512,7 +1514,7 @@ static int __cmd_report(const char *file_name, int argc, const char **argv)
BUG_ON(i != rec_argc);
- return cmd_report(i, rec_argv, NULL);
+ return cmd_report(i, rec_argv);
}
static int
@@ -1531,10 +1533,10 @@ __cmd_buildid_list(const char *file_name, int argc, const char **argv)
BUG_ON(i != rec_argc);
- return cmd_buildid_list(i, rec_argv, NULL);
+ return cmd_buildid_list(i, rec_argv);
}
-int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_kvm(int argc, const char **argv)
{
const char *file_name = NULL;
const struct option kvm_options[] = {
@@ -1589,9 +1591,9 @@ int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused)
else if (!strncmp(argv[0], "rep", 3))
return __cmd_report(file_name, argc, argv);
else if (!strncmp(argv[0], "diff", 4))
- return cmd_diff(argc, argv, NULL);
+ return cmd_diff(argc, argv);
else if (!strncmp(argv[0], "top", 3))
- return cmd_top(argc, argv, NULL);
+ return cmd_top(argc, argv);
else if (!strncmp(argv[0], "buildid-list", 12))
return __cmd_buildid_list(file_name, argc, argv);
#ifdef HAVE_KVM_STAT_SUPPORT
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 3b9d98b5feef..4bf2cb4d25aa 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -18,8 +18,9 @@
#include <subcmd/parse-options.h>
static bool desc_flag = true;
+static bool details_flag;
-int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_list(int argc, const char **argv)
{
int i;
bool raw_dump = false;
@@ -30,6 +31,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
"Print extra event descriptions. --no-desc to not print."),
OPT_BOOLEAN('v', "long-desc", &long_desc_flag,
"Print longer event descriptions."),
+ OPT_BOOLEAN(0, "details", &details_flag,
+ "Print information on the perf event names and expressions used internally by events."),
OPT_INCR(0, "debug", &verbose,
"Enable debugging output"),
OPT_END()
@@ -50,7 +53,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
printf("\nList of pre-defined events (to be used in -e):\n\n");
if (argc == 0) {
- print_events(NULL, raw_dump, !desc_flag, long_desc_flag);
+ print_events(NULL, raw_dump, !desc_flag, long_desc_flag,
+ details_flag);
return 0;
}
@@ -72,7 +76,7 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
print_hwcache_events(NULL, raw_dump);
else if (strcmp(argv[i], "pmu") == 0)
print_pmu_events(NULL, raw_dump, !desc_flag,
- long_desc_flag);
+ long_desc_flag, details_flag);
else if (strcmp(argv[i], "sdt") == 0)
print_sdt_events(NULL, NULL, raw_dump);
else if ((sep = strchr(argv[i], ':')) != NULL) {
@@ -80,7 +84,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
if (sep == NULL) {
print_events(argv[i], raw_dump, !desc_flag,
- long_desc_flag);
+ long_desc_flag,
+ details_flag);
continue;
}
sep_idx = sep - argv[i];
@@ -103,7 +108,8 @@ int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
event_symbols_sw, PERF_COUNT_SW_MAX, raw_dump);
print_hwcache_events(s, raw_dump);
print_pmu_events(s, raw_dump, !desc_flag,
- long_desc_flag);
+ long_desc_flag,
+ details_flag);
print_tracepoint_events(NULL, s, raw_dump);
print_sdt_events(NULL, s, raw_dump);
free(s);
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index ce3bfb48b26f..b686fb6759da 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -858,6 +858,7 @@ static int __cmd_report(bool display_info)
struct perf_tool eops = {
.sample = process_sample_event,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
};
struct perf_data_file file = {
@@ -940,34 +941,36 @@ static int __cmd_record(int argc, const char **argv)
BUG_ON(i != rec_argc);
- ret = cmd_record(i, rec_argv, NULL);
+ ret = cmd_record(i, rec_argv);
free(rec_argv);
return ret;
}
-int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_lock(int argc, const char **argv)
{
- const struct option info_options[] = {
- OPT_BOOLEAN('t', "threads", &info_threads,
- "dump thread list in perf.data"),
- OPT_BOOLEAN('m', "map", &info_map,
- "map of lock instances (address:name table)"),
- OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
- OPT_END()
- };
const struct option lock_options[] = {
OPT_STRING('i', "input", &input_name, "file", "input file name"),
OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_END()
};
+
+ const struct option info_options[] = {
+ OPT_BOOLEAN('t', "threads", &info_threads,
+ "dump thread list in perf.data"),
+ OPT_BOOLEAN('m', "map", &info_map,
+ "map of lock instances (address:name table)"),
+ OPT_PARENT(lock_options)
+ };
+
const struct option report_options[] = {
OPT_STRING('k', "key", &sort_key, "acquired",
"key for sorting (acquired / contended / avg_wait / wait_total / wait_max / wait_min)"),
- OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
/* TODO: type */
- OPT_END()
+ OPT_PARENT(lock_options)
};
+
const char * const info_usage[] = {
"perf lock info [<options>]",
NULL
@@ -1006,7 +1009,7 @@ int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
rc = __cmd_report(false);
} else if (!strcmp(argv[0], "script")) {
/* Aliased to 'perf script' */
- return cmd_script(argc, argv, prefix);
+ return cmd_script(argc, argv);
} else if (!strcmp(argv[0], "info")) {
if (argc) {
argc = parse_options(argc, argv,
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 6114e07ca613..643f4faac0d0 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -129,7 +129,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
pr_debug("\n");
}
- ret = cmd_record(i, rec_argv, NULL);
+ ret = cmd_record(i, rec_argv);
free(rec_argv);
return ret;
}
@@ -256,7 +256,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
for (j = 1; j < argc; j++, i++)
rep_argv[i] = argv[j];
- ret = cmd_report(i, rep_argv, NULL);
+ ret = cmd_report(i, rep_argv);
free(rep_argv);
return ret;
}
@@ -330,7 +330,7 @@ error:
return ret;
}
-int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_mem(int argc, const char **argv)
{
struct stat st;
struct perf_mem mem = {
@@ -342,6 +342,7 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
.lost = perf_event__process_lost,
.fork = perf_event__process_fork,
.build_id = perf_event__process_build_id,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
},
.input_name = "perf.data",
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index aa7885b92696..cf9f9e9c2fc0 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -442,9 +442,9 @@ static int perf_del_probe_events(struct strfilter *filter)
}
if (ret == -ENOENT && ret2 == -ENOENT)
- pr_debug("\"%s\" does not hit any event.\n", str);
- /* Note that this is silently ignored */
- ret = 0;
+ pr_warning("\"%s\" does not hit any event.\n", str);
+ else
+ ret = 0;
error:
if (kfd >= 0)
@@ -468,7 +468,7 @@ out:
static int
-__cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
+__cmd_probe(int argc, const char **argv)
{
const char * const probe_usage[] = {
"perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
@@ -687,13 +687,13 @@ __cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
return 0;
}
-int cmd_probe(int argc, const char **argv, const char *prefix)
+int cmd_probe(int argc, const char **argv)
{
int ret;
ret = init_params();
if (!ret) {
- ret = __cmd_probe(argc, argv, prefix);
+ ret = __cmd_probe(argc, argv);
cleanup_params();
}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index bc84a375295d..3191ab063852 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -876,6 +876,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
signal(SIGTERM, sig_handler);
signal(SIGSEGV, sigsegv_handler);
+ if (rec->opts.record_namespaces)
+ tool->namespace_events = true;
+
if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
signal(SIGUSR2, snapshot_sig_handler);
if (rec->opts.auxtrace_snapshot_mode)
@@ -983,6 +986,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
*/
if (forks) {
union perf_event *event;
+ pid_t tgid;
event = malloc(sizeof(event->comm) + machine->id_hdr_size);
if (event == NULL) {
@@ -996,10 +1000,30 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
* cannot see a correct process name for those events.
* Synthesize COMM event to prevent it.
*/
- perf_event__synthesize_comm(tool, event,
- rec->evlist->workload.pid,
- process_synthesized_event,
- machine);
+ tgid = perf_event__synthesize_comm(tool, event,
+ rec->evlist->workload.pid,
+ process_synthesized_event,
+ machine);
+ free(event);
+
+ if (tgid == -1)
+ goto out_child;
+
+ event = malloc(sizeof(event->namespaces) +
+ (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+ machine->id_hdr_size);
+ if (event == NULL) {
+ err = -ENOMEM;
+ goto out_child;
+ }
+
+ /*
+ * Synthesize NAMESPACES event for the command specified.
+ */
+ perf_event__synthesize_namespaces(tool, event,
+ rec->evlist->workload.pid,
+ tgid, process_synthesized_event,
+ machine);
free(event);
perf_evlist__start_workload(rec->evlist);
@@ -1497,6 +1521,7 @@ static struct record record = {
.fork = perf_event__process_fork,
.exit = perf_event__process_exit,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
@@ -1611,6 +1636,8 @@ static struct option __record_options[] = {
"opts", "AUX area tracing Snapshot Mode", ""),
OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
"per thread proc mmap processing timeout in ms"),
+ OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
+ "Record namespaces events"),
OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
"Record context switch events"),
OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
@@ -1640,7 +1667,7 @@ static struct option __record_options[] = {
struct option *record_options = __record_options;
-int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_record(int argc, const char **argv)
{
int err;
struct record *rec = &record;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 0a88670e56f3..c18158b83eb1 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -394,8 +394,7 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
fprintf(stdout, "\n\n");
}
- if (sort_order == NULL &&
- parent_pattern == default_parent_pattern)
+ if (!quiet)
fprintf(stdout, "#\n# (%s)\n#\n", help);
if (rep->show_threads) {
@@ -682,7 +681,7 @@ const char report_callchain_help[] = "Display call graph (stack chain/backtrace)
CALLCHAIN_REPORT_HELP
"\n\t\t\t\tDefault: " CALLCHAIN_DEFAULT_OPT;
-int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_report(int argc, const char **argv)
{
struct perf_session *session;
struct itrace_synth_opts itrace_synth_opts = { .set = 0, };
@@ -701,6 +700,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.lost = perf_event__process_lost,
@@ -845,6 +845,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
stdio__config_color, "always"),
OPT_STRING(0, "time", &report.time_str, "str",
"Time span of interest (start,stop)"),
+ OPT_BOOLEAN(0, "inline", &symbol_conf.inline_name,
+ "Show inline function"),
OPT_END()
};
struct perf_data_file file = {
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index b94cf0de715a..79833e226789 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -221,6 +221,7 @@ struct perf_sched {
unsigned int max_stack;
bool show_cpu_visual;
bool show_wakeups;
+ bool show_next;
bool show_migrations;
bool show_state;
u64 skipped_samples;
@@ -1897,14 +1898,18 @@ static char task_state_char(struct thread *thread, int state)
}
static void timehist_print_sample(struct perf_sched *sched,
+ struct perf_evsel *evsel,
struct perf_sample *sample,
struct addr_location *al,
struct thread *thread,
u64 t, int state)
{
struct thread_runtime *tr = thread__priv(thread);
+ const char *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
+ const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
u32 max_cpus = sched->max_cpu + 1;
char tstr[64];
+ char nstr[30];
u64 wait_time;
timestamp__scnprintf_usec(t, tstr, sizeof(tstr));
@@ -1937,7 +1942,12 @@ static void timehist_print_sample(struct perf_sched *sched,
if (sched->show_state)
printf(" %5c ", task_state_char(thread, state));
- if (sched->show_wakeups)
+ if (sched->show_next) {
+ snprintf(nstr, sizeof(nstr), "next: %s[%d]", next_comm, next_pid);
+ printf(" %-*s", comm_width, nstr);
+ }
+
+ if (sched->show_wakeups && !sched->show_next)
printf(" %-*s", comm_width, "");
if (thread->tid == 0)
@@ -2531,7 +2541,7 @@ static int timehist_sched_change_event(struct perf_tool *tool,
}
if (!sched->summary_only)
- timehist_print_sample(sched, sample, &al, thread, t, state);
+ timehist_print_sample(sched, evsel, sample, &al, thread, t, state);
out:
if (sched->hist_time.start == 0 && t >= ptime->start)
@@ -3262,16 +3272,17 @@ static int __cmd_record(int argc, const char **argv)
BUG_ON(i != rec_argc);
- return cmd_record(i, rec_argv, NULL);
+ return cmd_record(i, rec_argv);
}
-int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_sched(int argc, const char **argv)
{
const char default_sort_order[] = "avg, max, switch, runtime";
struct perf_sched sched = {
.tool = {
.sample = perf_sched__process_tracepoint_sample,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.lost = perf_event__process_lost,
.fork = perf_sched__process_fork_event,
.ordered_events = true,
@@ -3340,6 +3351,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
OPT_BOOLEAN('S', "with-summary", &sched.summary,
"Show all syscalls and summary with statistics"),
OPT_BOOLEAN('w', "wakeups", &sched.show_wakeups, "Show wakeup events"),
+ OPT_BOOLEAN('n', "next", &sched.show_next, "Show next task"),
OPT_BOOLEAN('M', "migrations", &sched.show_migrations, "Show migration events"),
OPT_BOOLEAN('V', "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
OPT_BOOLEAN('I', "idle-hist", &sched.idle_hist, "Show idle events only"),
@@ -3400,7 +3412,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
* Aliased to 'perf script' for now:
*/
if (!strcmp(argv[0], "script"))
- return cmd_script(argc, argv, prefix);
+ return cmd_script(argc, argv);
if (!strncmp(argv[0], "rec", 3)) {
return __cmd_record(argc, argv);
@@ -3437,10 +3449,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
if (argc)
usage_with_options(timehist_usage, timehist_options);
}
- if (sched.show_wakeups && sched.summary_only) {
- pr_err(" Error: -s and -w are mutually exclusive.\n");
+ if ((sched.show_wakeups || sched.show_next) &&
+ sched.summary_only) {
+ pr_err(" Error: -s and -[n|w] are mutually exclusive.\n");
parse_options_usage(timehist_usage, timehist_options, "s", true);
- parse_options_usage(NULL, timehist_options, "w", true);
+ if (sched.show_wakeups)
+ parse_options_usage(NULL, timehist_options, "w", true);
+ if (sched.show_next)
+ parse_options_usage(NULL, timehist_options, "n", true);
return -EINVAL;
}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c0783b4f7b6c..46acc8ece41f 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -28,6 +28,7 @@
#include <linux/time64.h>
#include "asm/bug.h"
#include "util/mem-events.h"
+#include "util/dump-insn.h"
static char const *script_name;
static char const *generate_script_lang;
@@ -42,6 +43,7 @@ static bool nanosecs;
static const char *cpu_list;
static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
static struct perf_stat_config stat_config;
+static int max_blocks;
unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
@@ -69,6 +71,7 @@ enum perf_output_field {
PERF_OUTPUT_CALLINDENT = 1U << 20,
PERF_OUTPUT_INSN = 1U << 21,
PERF_OUTPUT_INSNLEN = 1U << 22,
+ PERF_OUTPUT_BRSTACKINSN = 1U << 23,
};
struct output_option {
@@ -98,6 +101,7 @@ struct output_option {
{.str = "callindent", .field = PERF_OUTPUT_CALLINDENT},
{.str = "insn", .field = PERF_OUTPUT_INSN},
{.str = "insnlen", .field = PERF_OUTPUT_INSNLEN},
+ {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
};
/* default set to maintain compatibility with current format */
@@ -292,7 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
"selected. Hence, no address to lookup the source line number.\n");
return -EINVAL;
}
-
+ if (PRINT_FIELD(BRSTACKINSN) &&
+ !(perf_evlist__combined_branch_type(session->evlist) &
+ PERF_SAMPLE_BRANCH_ANY)) {
+ pr_err("Display of branch stack assembler requested, but non all-branch filter set\n"
+ "Hint: run 'perf record -b ...'\n");
+ return -EINVAL;
+ }
if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -546,6 +556,233 @@ static void print_sample_brstacksym(struct perf_sample *sample,
}
}
+#define MAXBB 16384UL
+
+static int grab_bb(u8 *buffer, u64 start, u64 end,
+ struct machine *machine, struct thread *thread,
+ bool *is64bit, u8 *cpumode, bool last)
+{
+ long offset, len;
+ struct addr_location al;
+ bool kernel;
+
+ if (!start || !end)
+ return 0;
+
+ kernel = machine__kernel_ip(machine, start);
+ if (kernel)
+ *cpumode = PERF_RECORD_MISC_KERNEL;
+ else
+ *cpumode = PERF_RECORD_MISC_USER;
+
+ /*
+ * Block overlaps between kernel and user.
+ * This can happen due to ring filtering
+ * On Intel CPUs the entry into the kernel is filtered,
+ * but the exit is not. Let the caller patch it up.
+ */
+ if (kernel != machine__kernel_ip(machine, end)) {
+ printf("\tblock %" PRIx64 "-%" PRIx64 " transfers between kernel and user\n",
+ start, end);
+ return -ENXIO;
+ }
+
+ memset(&al, 0, sizeof(al));
+ if (end - start > MAXBB - MAXINSN) {
+ if (last)
+ printf("\tbrstack does not reach to final jump (%" PRIx64 "-%" PRIx64 ")\n", start, end);
+ else
+ printf("\tblock %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n", start, end, end - start);
+ return 0;
+ }
+
+ thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+ if (!al.map || !al.map->dso) {
+ printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+ return 0;
+ }
+ if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+ printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+ return 0;
+ }
+
+ /* Load maps to ensure dso->is_64_bit has been updated */
+ map__load(al.map);
+
+ offset = al.map->map_ip(al.map, start);
+ len = dso__data_read_offset(al.map->dso, machine, offset, (u8 *)buffer,
+ end - start + MAXINSN);
+
+ *is64bit = al.map->dso->is_64_bit;
+ if (len <= 0)
+ printf("\tcannot fetch code for block at %" PRIx64 "-%" PRIx64 "\n",
+ start, end);
+ return len;
+}
+
+static void print_jump(uint64_t ip, struct branch_entry *en,
+ struct perf_insn *x, u8 *inbuf, int len,
+ int insn)
+{
+ printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+ ip,
+ dump_insn(x, ip, inbuf, len, NULL),
+ en->flags.predicted ? " PRED" : "",
+ en->flags.mispred ? " MISPRED" : "",
+ en->flags.in_tx ? " INTX" : "",
+ en->flags.abort ? " ABORT" : "");
+ if (en->flags.cycles) {
+ printf(" %d cycles", en->flags.cycles);
+ if (insn)
+ printf(" %.2f IPC", (float)insn / en->flags.cycles);
+ }
+ putchar('\n');
+}
+
+static void print_ip_sym(struct thread *thread, u8 cpumode, int cpu,
+ uint64_t addr, struct symbol **lastsym,
+ struct perf_event_attr *attr)
+{
+ struct addr_location al;
+ int off;
+
+ memset(&al, 0, sizeof(al));
+
+ thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
+ if (!al.map)
+ thread__find_addr_map(thread, cpumode, MAP__VARIABLE,
+ addr, &al);
+ if ((*lastsym) && al.addr >= (*lastsym)->start && al.addr < (*lastsym)->end)
+ return;
+
+ al.cpu = cpu;
+ al.sym = NULL;
+ if (al.map)
+ al.sym = map__find_symbol(al.map, al.addr);
+
+ if (!al.sym)
+ return;
+
+ if (al.addr < al.sym->end)
+ off = al.addr - al.sym->start;
+ else
+ off = al.addr - al.map->start - al.sym->start;
+ printf("\t%s", al.sym->name);
+ if (off)
+ printf("%+d", off);
+ putchar(':');
+ if (PRINT_FIELD(SRCLINE))
+ map__fprintf_srcline(al.map, al.addr, "\t", stdout);
+ putchar('\n');
+ *lastsym = al.sym;
+}
+
+static void print_sample_brstackinsn(struct perf_sample *sample,
+ struct thread *thread,
+ struct perf_event_attr *attr,
+ struct machine *machine)
+{
+ struct branch_stack *br = sample->branch_stack;
+ u64 start, end;
+ int i, insn, len, nr, ilen;
+ struct perf_insn x;
+ u8 buffer[MAXBB];
+ unsigned off;
+ struct symbol *lastsym = NULL;
+
+ if (!(br && br->nr))
+ return;
+ nr = br->nr;
+ if (max_blocks && nr > max_blocks + 1)
+ nr = max_blocks + 1;
+
+ x.thread = thread;
+ x.cpu = sample->cpu;
+
+ putchar('\n');
+
+ /* Handle first from jump, of which we don't know the entry. */
+ len = grab_bb(buffer, br->entries[nr-1].from,
+ br->entries[nr-1].from,
+ machine, thread, &x.is64bit, &x.cpumode, false);
+ if (len > 0) {
+ print_ip_sym(thread, x.cpumode, x.cpu,
+ br->entries[nr - 1].from, &lastsym, attr);
+ print_jump(br->entries[nr - 1].from, &br->entries[nr - 1],
+ &x, buffer, len, 0);
+ }
+
+ /* Print all blocks */
+ for (i = nr - 2; i >= 0; i--) {
+ if (br->entries[i].from || br->entries[i].to)
+ pr_debug("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+ br->entries[i].from,
+ br->entries[i].to);
+ start = br->entries[i + 1].to;
+ end = br->entries[i].from;
+
+ len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
+ /* Patch up missing kernel transfers due to ring filters */
+ if (len == -ENXIO && i > 0) {
+ end = br->entries[--i].from;
+ pr_debug("\tpatching up to %" PRIx64 "-%" PRIx64 "\n", start, end);
+ len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, false);
+ }
+ if (len <= 0)
+ continue;
+
+ insn = 0;
+ for (off = 0;; off += ilen) {
+ uint64_t ip = start + off;
+
+ print_ip_sym(thread, x.cpumode, x.cpu, ip, &lastsym, attr);
+ if (ip == end) {
+ print_jump(ip, &br->entries[i], &x, buffer + off, len - off, insn);
+ break;
+ } else {
+ printf("\t%016" PRIx64 "\t%s\n", ip,
+ dump_insn(&x, ip, buffer + off, len - off, &ilen));
+ if (ilen == 0)
+ break;
+ insn++;
+ }
+ }
+ }
+
+ /*
+ * Hit the branch? In this case we are already done, and the target
+ * has not been executed yet.
+ */
+ if (br->entries[0].from == sample->ip)
+ return;
+ if (br->entries[0].flags.abort)
+ return;
+
+ /*
+ * Print final block upto sample
+ */
+ start = br->entries[0].to;
+ end = sample->ip;
+ len = grab_bb(buffer, start, end, machine, thread, &x.is64bit, &x.cpumode, true);
+ print_ip_sym(thread, x.cpumode, x.cpu, start, &lastsym, attr);
+ if (len <= 0) {
+ /* Print at least last IP if basic block did not work */
+ len = grab_bb(buffer, sample->ip, sample->ip,
+ machine, thread, &x.is64bit, &x.cpumode, false);
+ if (len <= 0)
+ return;
+
+ printf("\t%016" PRIx64 "\t%s\n", sample->ip,
+ dump_insn(&x, sample->ip, buffer, len, NULL));
+ return;
+ }
+ for (off = 0; off <= end - start; off += ilen) {
+ printf("\t%016" PRIx64 "\t%s\n", start + off,
+ dump_insn(&x, start + off, buffer + off, len - off, &ilen));
+ if (ilen == 0)
+ break;
+ }
+}
static void print_sample_addr(struct perf_sample *sample,
struct thread *thread,
@@ -632,7 +869,9 @@ static void print_sample_callindent(struct perf_sample *sample,
}
static void print_insn(struct perf_sample *sample,
- struct perf_event_attr *attr)
+ struct perf_event_attr *attr,
+ struct thread *thread,
+ struct machine *machine)
{
if (PRINT_FIELD(INSNLEN))
printf(" ilen: %d", sample->insn_len);
@@ -643,12 +882,15 @@ static void print_insn(struct perf_sample *sample,
for (i = 0; i < sample->insn_len; i++)
printf(" %02x", (unsigned char)sample->insn[i]);
}
+ if (PRINT_FIELD(BRSTACKINSN))
+ print_sample_brstackinsn(sample, thread, attr, machine);
}
static void print_sample_bts(struct perf_sample *sample,
struct perf_evsel *evsel,
struct thread *thread,
- struct addr_location *al)
+ struct addr_location *al,
+ struct machine *machine)
{
struct perf_event_attr *attr = &evsel->attr;
bool print_srcline_last = false;
@@ -689,7 +931,7 @@ static void print_sample_bts(struct perf_sample *sample,
if (print_srcline_last)
map__fprintf_srcline(al->map, al->addr, "\n ", stdout);
- print_insn(sample, attr);
+ print_insn(sample, attr, thread, machine);
printf("\n");
}
@@ -830,6 +1072,7 @@ struct perf_script {
bool show_task_events;
bool show_mmap_events;
bool show_switch_events;
+ bool show_namespace_events;
bool allocated;
struct cpu_map *cpus;
struct thread_map *threads;
@@ -871,7 +1114,8 @@ static size_t data_src__printf(u64 data_src)
static void process_event(struct perf_script *script,
struct perf_sample *sample, struct perf_evsel *evsel,
- struct addr_location *al)
+ struct addr_location *al,
+ struct machine *machine)
{
struct thread *thread = al->thread;
struct perf_event_attr *attr = &evsel->attr;
@@ -898,7 +1142,7 @@ static void process_event(struct perf_script *script,
print_sample_flags(sample->flags);
if (is_bts_event(attr)) {
- print_sample_bts(sample, evsel, thread, al);
+ print_sample_bts(sample, evsel, thread, al, machine);
return;
}
@@ -936,7 +1180,7 @@ static void process_event(struct perf_script *script,
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
- print_insn(sample, attr);
+ print_insn(sample, attr, thread, machine);
printf("\n");
}
@@ -1046,7 +1290,7 @@ static int process_sample_event(struct perf_tool *tool,
if (scripting_ops)
scripting_ops->process_event(event, sample, evsel, &al);
else
- process_event(scr, sample, evsel, &al);
+ process_event(scr, sample, evsel, &al, machine);
out_put:
addr_location__put(&al);
@@ -1118,6 +1362,41 @@ out:
return ret;
}
+static int process_namespaces_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_script *script = container_of(tool, struct perf_script, tool);
+ struct perf_session *session = script->session;
+ struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, sample->id);
+ int ret = -1;
+
+ thread = machine__findnew_thread(machine, event->namespaces.pid,
+ event->namespaces.tid);
+ if (thread == NULL) {
+ pr_debug("problem processing NAMESPACES event, skipping it.\n");
+ return -1;
+ }
+
+ if (perf_event__process_namespaces(tool, event, sample, machine) < 0)
+ goto out;
+
+ if (!evsel->attr.sample_id_all) {
+ sample->cpu = 0;
+ sample->time = 0;
+ sample->tid = event->namespaces.tid;
+ sample->pid = event->namespaces.pid;
+ }
+ print_sample_start(sample, thread, evsel);
+ perf_event__fprintf(event, stdout);
+ ret = 0;
+out:
+ thread__put(thread);
+ return ret;
+}
+
static int process_fork_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -1293,6 +1572,8 @@ static int __cmd_script(struct perf_script *script)
}
if (script->show_switch_events)
script->tool.context_switch = process_switch_event;
+ if (script->show_namespace_events)
+ script->tool.namespaces = process_namespaces_event;
ret = perf_session__process_events(script->session);
@@ -2078,7 +2359,7 @@ int process_cpu_map_event(struct perf_tool *tool __maybe_unused,
return set_maps(script);
}
-int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_script(int argc, const char **argv)
{
bool show_full_info = false;
bool header = false;
@@ -2097,6 +2378,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
.mmap = perf_event__process_mmap,
.mmap2 = perf_event__process_mmap2,
.comm = perf_event__process_comm,
+ .namespaces = perf_event__process_namespaces,
.exit = perf_event__process_exit,
.fork = perf_event__process_fork,
.attr = process_attr,
@@ -2152,7 +2434,8 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Valid types: hw,sw,trace,raw. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,period,iregs,brstack,brstacksym,flags,"
- "bpf-output,callindent,insn,insnlen", parse_output_fields),
+ "bpf-output,callindent,insn,insnlen,brstackinsn",
+ parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
@@ -2180,7 +2463,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
"Show the mmap events"),
OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
"Show context switch events (if recorded)"),
+ OPT_BOOLEAN('\0', "show-namespace-events", &script.show_namespace_events,
+ "Show namespace events (if recorded)"),
OPT_BOOLEAN('f', "force", &symbol_conf.force, "don't complain, do it"),
+ OPT_INTEGER(0, "max-blocks", &max_blocks,
+ "Maximum number of code blocks to dump with brstackinsn"),
OPT_BOOLEAN(0, "ns", &nanosecs,
"Use 9 decimal places when displaying time"),
OPT_CALLBACK_OPTARG(0, "itrace", &itrace_synth_opts, NULL, "opts",
@@ -2217,7 +2504,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
if (!rec_script_path)
- return cmd_record(argc, argv, NULL);
+ return cmd_record(argc, argv);
}
if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 13b54999ad79..2158ea14da57 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -140,12 +140,14 @@ static unsigned int unit_width = 4; /* strlen("unit") */
static bool forever = false;
static bool metric_only = false;
static bool force_metric_only = false;
+static bool no_merge = false;
static struct timespec ref_time;
static struct cpu_map *aggr_map;
static aggr_get_id_t aggr_get_id;
static bool append_file;
static const char *output_name;
static int output_fd;
+static int print_free_counters_hint;
struct perf_stat {
bool record;
@@ -1109,6 +1111,9 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval,
counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
csv_sep);
+ if (counter->supported)
+ print_free_counters_hint = 1;
+
fprintf(stat_config.output, "%-*s%s",
csv_output ? 0 : unit_width,
counter->unit, csv_sep);
@@ -1140,6 +1145,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval,
out.print_metric = pm;
out.new_line = nl;
out.ctx = &os;
+ out.force_header = false;
if (csv_output && !metric_only) {
print_noise(counter, noise);
@@ -1178,11 +1184,81 @@ static void aggr_update_shadow(void)
}
}
+static void collect_all_aliases(struct perf_evsel *counter,
+ void (*cb)(struct perf_evsel *counter, void *data,
+ bool first),
+ void *data)
+{
+ struct perf_evsel *alias;
+
+ alias = list_prepare_entry(counter, &(evsel_list->entries), node);
+ list_for_each_entry_continue (alias, &evsel_list->entries, node) {
+ if (strcmp(perf_evsel__name(alias), perf_evsel__name(counter)) ||
+ alias->scale != counter->scale ||
+ alias->cgrp != counter->cgrp ||
+ strcmp(alias->unit, counter->unit) ||
+ nsec_counter(alias) != nsec_counter(counter))
+ break;
+ alias->merged_stat = true;
+ cb(alias, data, false);
+ }
+}
+
+static bool collect_data(struct perf_evsel *counter,
+ void (*cb)(struct perf_evsel *counter, void *data,
+ bool first),
+ void *data)
+{
+ if (counter->merged_stat)
+ return false;
+ cb(counter, data, true);
+ if (!no_merge)
+ collect_all_aliases(counter, cb, data);
+ return true;
+}
+
+struct aggr_data {
+ u64 ena, run, val;
+ int id;
+ int nr;
+ int cpu;
+};
+
+static void aggr_cb(struct perf_evsel *counter, void *data, bool first)
+{
+ struct aggr_data *ad = data;
+ int cpu, s2;
+
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ struct perf_counts_values *counts;
+
+ s2 = aggr_get_id(perf_evsel__cpus(counter), cpu);
+ if (s2 != ad->id)
+ continue;
+ if (first)
+ ad->nr++;
+ counts = perf_counts(counter->counts, cpu, 0);
+ /*
+ * When any result is bad, make them all to give
+ * consistent output in interval mode.
+ */
+ if (counts->ena == 0 || counts->run == 0 ||
+ counter->counts->scaled == -1) {
+ ad->ena = 0;
+ ad->run = 0;
+ break;
+ }
+ ad->val += counts->val;
+ ad->ena += counts->ena;
+ ad->run += counts->run;
+ }
+}
+
static void print_aggr(char *prefix)
{
FILE *output = stat_config.output;
struct perf_evsel *counter;
- int cpu, s, s2, id, nr;
+ int s, id, nr;
double uval;
u64 ena, run, val;
bool first;
@@ -1197,23 +1273,21 @@ static void print_aggr(char *prefix)
* Without each counter has its own line.
*/
for (s = 0; s < aggr_map->nr; s++) {
+ struct aggr_data ad;
if (prefix && metric_only)
fprintf(output, "%s", prefix);
- id = aggr_map->map[s];
+ ad.id = id = aggr_map->map[s];
first = true;
evlist__for_each_entry(evsel_list, counter) {
- val = ena = run = 0;
- nr = 0;
- for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
- s2 = aggr_get_id(perf_evsel__cpus(counter), cpu);
- if (s2 != id)
- continue;
- val += perf_counts(counter->counts, cpu, 0)->val;
- ena += perf_counts(counter->counts, cpu, 0)->ena;
- run += perf_counts(counter->counts, cpu, 0)->run;
- nr++;
- }
+ ad.val = ad.ena = ad.run = 0;
+ ad.nr = 0;
+ if (!collect_data(counter, aggr_cb, &ad))
+ continue;
+ nr = ad.nr;
+ ena = ad.ena;
+ run = ad.run;
+ val = ad.val;
if (first && metric_only) {
first = false;
aggr_printout(counter, id, nr);
@@ -1257,6 +1331,21 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
}
}
+struct caggr_data {
+ double avg, avg_enabled, avg_running;
+};
+
+static void counter_aggr_cb(struct perf_evsel *counter, void *data,
+ bool first __maybe_unused)
+{
+ struct caggr_data *cd = data;
+ struct perf_stat_evsel *ps = counter->priv;
+
+ cd->avg += avg_stats(&ps->res_stats[0]);
+ cd->avg_enabled += avg_stats(&ps->res_stats[1]);
+ cd->avg_running += avg_stats(&ps->res_stats[2]);
+}
+
/*
* Print out the results of a single counter:
* aggregated counts in system-wide mode
@@ -1264,23 +1353,31 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix)
static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
{
FILE *output = stat_config.output;
- struct perf_stat_evsel *ps = counter->priv;
- double avg = avg_stats(&ps->res_stats[0]);
double uval;
- double avg_enabled, avg_running;
+ struct caggr_data cd = { .avg = 0.0 };
- avg_enabled = avg_stats(&ps->res_stats[1]);
- avg_running = avg_stats(&ps->res_stats[2]);
+ if (!collect_data(counter, counter_aggr_cb, &cd))
+ return;
if (prefix && !metric_only)
fprintf(output, "%s", prefix);
- uval = avg * counter->scale;
- printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg);
+ uval = cd.avg * counter->scale;
+ printout(-1, 0, counter, uval, prefix, cd.avg_running, cd.avg_enabled, cd.avg);
if (!metric_only)
fprintf(output, "\n");
}
+static void counter_cb(struct perf_evsel *counter, void *data,
+ bool first __maybe_unused)
+{
+ struct aggr_data *ad = data;
+
+ ad->val += perf_counts(counter->counts, ad->cpu, 0)->val;
+ ad->ena += perf_counts(counter->counts, ad->cpu, 0)->ena;
+ ad->run += perf_counts(counter->counts, ad->cpu, 0)->run;
+}
+
/*
* Print out the results of a single counter:
* does not use aggregated count in system-wide
@@ -1293,9 +1390,13 @@ static void print_counter(struct perf_evsel *counter, char *prefix)
int cpu;
for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
- val = perf_counts(counter->counts, cpu, 0)->val;
- ena = perf_counts(counter->counts, cpu, 0)->ena;
- run = perf_counts(counter->counts, cpu, 0)->run;
+ struct aggr_data ad = { .cpu = cpu };
+
+ if (!collect_data(counter, counter_cb, &ad))
+ return;
+ val = ad.val;
+ ena = ad.ena;
+ run = ad.run;
if (prefix)
fprintf(output, "%s", prefix);
@@ -1380,6 +1481,7 @@ static void print_metric_headers(const char *prefix, bool no_indent)
out.ctx = &os;
out.print_metric = print_metric_header;
out.new_line = new_line_metric;
+ out.force_header = true;
os.evsel = counter;
perf_stat__print_shadow_stats(counter, 0,
0,
@@ -1477,6 +1579,13 @@ static void print_footer(void)
avg_stats(&walltime_nsecs_stats));
}
fprintf(output, "\n\n");
+
+ if (print_free_counters_hint)
+ fprintf(output,
+"Some events weren't counted. Try disabling the NMI watchdog:\n"
+" echo 0 > /proc/sys/kernel/nmi_watchdog\n"
+" perf stat ...\n"
+" echo 1 > /proc/sys/kernel/nmi_watchdog\n");
}
static void print_counters(struct timespec *ts, int argc, const char **argv)
@@ -1633,6 +1742,7 @@ static const struct option stat_options[] = {
"list of cpus to monitor in system-wide"),
OPT_SET_UINT('A', "no-aggr", &stat_config.aggr_mode,
"disable CPU count aggregation", AGGR_NONE),
+ OPT_BOOLEAN(0, "no-merge", &no_merge, "Do not merge identical named events"),
OPT_STRING('x', "field-separator", &csv_sep, "separator",
"print counts with custom separator"),
OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
@@ -2339,7 +2449,36 @@ static int __cmd_report(int argc, const char **argv)
return 0;
}
-int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
+static void setup_system_wide(int forks)
+{
+ /*
+ * Make system wide (-a) the default target if
+ * no target was specified and one of following
+ * conditions is met:
+ *
+ * - there's no workload specified
+ * - there is workload specified but all requested
+ * events are system wide events
+ */
+ if (!target__none(&target))
+ return;
+
+ if (!forks)
+ target.system_wide = true;
+ else {
+ struct perf_evsel *counter;
+
+ evlist__for_each_entry(evsel_list, counter) {
+ if (!counter->system_wide)
+ return;
+ }
+
+ if (evsel_list->nr_entries)
+ target.system_wide = true;
+ }
+}
+
+int cmd_stat(int argc, const char **argv)
{
const char * const stat_usage[] = {
"perf stat [<options>] [<command>]",
@@ -2361,6 +2500,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands,
(const char **) stat_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
+ perf_stat__collect_metric_expr(evsel_list);
perf_stat__init_shadow_stats();
if (csv_sep) {
@@ -2445,9 +2585,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
} else if (big_num_opt == 0) /* User passed --no-big-num */
big_num = false;
- /* Make system wide (-a) the default target. */
- if (!argc && target__none(&target))
- target.system_wide = true;
+ setup_system_wide(argc);
if (run_count < 0) {
pr_err("Run count must be a positive number\n");
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index e7eaa298d34a..fafdb44b8bcb 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1773,7 +1773,7 @@ static int timechart__io_record(int argc, const char **argv)
for (i = 0; i < (unsigned int)argc; i++)
*p++ = argv[i];
- return cmd_record(rec_argc, rec_argv, NULL);
+ return cmd_record(rec_argc, rec_argv);
}
@@ -1864,7 +1864,7 @@ static int timechart__record(struct timechart *tchart, int argc, const char **ar
for (j = 0; j < (unsigned int)argc; j++)
*p++ = argv[j];
- return cmd_record(rec_argc, rec_argv, NULL);
+ return cmd_record(rec_argc, rec_argv);
}
static int
@@ -1917,8 +1917,7 @@ parse_time(const struct option *opt, const char *arg, int __maybe_unused unset)
return 0;
}
-int cmd_timechart(int argc, const char **argv,
- const char *prefix __maybe_unused)
+int cmd_timechart(int argc, const char **argv)
{
struct timechart tchart = {
.tool = {
@@ -1933,6 +1932,11 @@ int cmd_timechart(int argc, const char **argv,
.merge_dist = 1000,
};
const char *output_name = "output.svg";
+ const struct option timechart_common_options[] = {
+ OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
+ OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only, "output processes data only"),
+ OPT_END()
+ };
const struct option timechart_options[] = {
OPT_STRING('i', "input", &input_name, "file", "input file name"),
OPT_STRING('o', "output", &output_name, "file", "output file name"),
@@ -1940,9 +1944,6 @@ int cmd_timechart(int argc, const char **argv,
OPT_CALLBACK(0, "highlight", NULL, "duration or task name",
"highlight tasks. Pass duration in ns or process name.",
parse_highlight),
- OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
- OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only,
- "output processes data only"),
OPT_CALLBACK('p', "process", NULL, "process",
"process selector. Pass a pid or process name.",
parse_process),
@@ -1962,22 +1963,18 @@ int cmd_timechart(int argc, const char **argv,
"merge events that are merge-dist us apart",
parse_time),
OPT_BOOLEAN('f', "force", &tchart.force, "don't complain, do it"),
- OPT_END()
+ OPT_PARENT(timechart_common_options),
};
const char * const timechart_subcommands[] = { "record", NULL };
const char *timechart_usage[] = {
"perf timechart [<options>] {record}",
NULL
};
-
const struct option timechart_record_options[] = {
- OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
- OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only,
- "output processes data only"),
OPT_BOOLEAN('I', "io-only", &tchart.io_only,
"record only IO data"),
OPT_BOOLEAN('g', "callchain", &tchart.with_backtrace, "record callchain"),
- OPT_END()
+ OPT_PARENT(timechart_common_options),
};
const char * const timechart_record_usage[] = {
"perf timechart record [<options>]",
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ab9077915763..a0c97c70ec81 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -1075,7 +1075,7 @@ parse_percent_limit(const struct option *opt, const char *arg,
const char top_callchain_help[] = CALLCHAIN_RECORD_HELP CALLCHAIN_REPORT_HELP
"\n\t\t\t\tDefault: fp,graph,0.5,caller,function";
-int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_top(int argc, const char **argv)
{
char errbuf[BUFSIZ];
struct perf_top top = {
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 256f1fac6f7e..fce278d5fada 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -31,6 +31,7 @@
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
+#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
@@ -267,15 +268,6 @@ out_delete:
({ struct syscall_tp *fields = evsel->priv; \
fields->name.pointer(&fields->name, sample); })
-struct syscall_arg {
- unsigned long val;
- struct thread *thread;
- struct trace *trace;
- void *parm;
- u8 idx;
- u8 mask;
-};
-
struct strarray {
int offset;
int nr_entries;
@@ -771,6 +763,10 @@ static struct syscall_fmt {
.arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
{ .name = "stat", .errmsg = true, .alias = "newstat", },
{ .name = "statfs", .errmsg = true, },
+ { .name = "statx", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
+ [2] = SCA_STATX_FLAGS, /* flags */
+ [3] = SCA_STATX_MASK, /* mask */ }, },
{ .name = "swapoff", .errmsg = true,
.arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
{ .name = "swapon", .errmsg = true,
@@ -821,12 +817,21 @@ struct syscall {
void **arg_parm;
};
-static size_t fprintf_duration(unsigned long t, FILE *fp)
+/*
+ * We need to have this 'calculated' boolean because in some cases we really
+ * don't know what is the duration of a syscall, for instance, when we start
+ * a session and some threads are waiting for a syscall to finish, say 'poll',
+ * in which case all we can do is to print "( ? ) for duration and for the
+ * start timestamp.
+ */
+static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
double duration = (double)t / NSEC_PER_MSEC;
size_t printed = fprintf(fp, "(");
- if (duration >= 1.0)
+ if (!calculated)
+ printed += fprintf(fp, " ? ");
+ else if (duration >= 1.0)
printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
else if (duration >= 0.01)
printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
@@ -1028,13 +1033,27 @@ static bool trace__filter_duration(struct trace *trace, double t)
return t < (trace->duration_filter * NSEC_PER_MSEC);
}
-static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
+static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
return fprintf(fp, "%10.3f ", ts);
}
+/*
+ * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
+ * using ttrace->entry_time for a thread that receives a sys_exit without
+ * first having received a sys_enter ("poll" issued before tracing session
+ * starts, lost sys_enter exit due to ring buffer overflow).
+ */
+static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
+{
+ if (tstamp > 0)
+ return __trace__fprintf_tstamp(trace, tstamp, fp);
+
+ return fprintf(fp, " ? ");
+}
+
static bool done = false;
static bool interrupted = false;
@@ -1045,10 +1064,10 @@ static void sig_handler(int sig)
}
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
- u64 duration, u64 tstamp, FILE *fp)
+ u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
- printed += fprintf_duration(duration, fp);
+ printed += fprintf_duration(duration, duration_calculated, fp);
if (trace->multiple_threads) {
if (trace->show_comm)
@@ -1450,7 +1469,7 @@ static int trace__printf_interrupted_entry(struct trace *trace, struct perf_samp
duration = sample->time - ttrace->entry_time;
- printed = trace__fprintf_entry_head(trace, trace->current, duration, ttrace->entry_time, trace->output);
+ printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
ttrace->entry_pending = false;
@@ -1497,7 +1516,7 @@ static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
if (sc->is_exit) {
if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
- trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output);
+ trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
}
} else {
@@ -1545,6 +1564,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
{
long ret;
u64 duration = 0;
+ bool duration_calculated = false;
struct thread *thread;
int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
struct syscall *sc = trace__syscall_info(trace, evsel, id);
@@ -1573,6 +1593,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
duration = sample->time - ttrace->entry_time;
if (trace__filter_duration(trace, duration))
goto out;
+ duration_calculated = true;
} else if (trace->duration_filter)
goto out;
@@ -1588,7 +1609,7 @@ static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
if (trace->summary_only)
goto out;
- trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output);
+ trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
if (ttrace->entry_pending) {
fprintf(trace->output, "%-70s", ttrace->entry_str);
@@ -1653,15 +1674,17 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
ttrace = thread__priv(thread);
if (!ttrace)
- goto out;
+ goto out_put;
filename_len = strlen(filename);
+ if (filename_len == 0)
+ goto out_put;
if (ttrace->filename.namelen < filename_len) {
char *f = realloc(ttrace->filename.name, filename_len + 1);
if (f == NULL)
- goto out;
+ goto out_put;
ttrace->filename.namelen = filename_len;
ttrace->filename.name = f;
@@ -1671,12 +1694,12 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
ttrace->filename.pending_open = true;
if (!ttrace->filename.ptr)
- goto out;
+ goto out_put;
entry_str_len = strlen(ttrace->entry_str);
remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
if (remaining_space <= 0)
- goto out;
+ goto out_put;
if (filename_len > (size_t)remaining_space) {
filename += filename_len - remaining_space;
@@ -1690,6 +1713,8 @@ static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
ttrace->filename.ptr = 0;
ttrace->filename.entry_str_pos = 0;
+out_put:
+ thread__put(thread);
out:
return 0;
}
@@ -1710,6 +1735,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evs
ttrace->runtime_ms += runtime_ms;
trace->runtime_ms += runtime_ms;
+out_put:
thread__put(thread);
return 0;
@@ -1720,8 +1746,7 @@ out_dump:
(pid_t)perf_evsel__intval(evsel, sample, "pid"),
runtime,
perf_evsel__intval(evsel, sample, "vruntime"));
- thread__put(thread);
- return 0;
+ goto out_put;
}
static void bpf_output__printer(enum binary_printer_ops op,
@@ -1851,7 +1876,7 @@ static int trace__pgfault(struct trace *trace,
thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
sample->ip, &al);
- trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
+ trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
fprintf(trace->output, "%sfault [",
evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
@@ -1920,7 +1945,7 @@ static int trace__process_sample(struct perf_tool *tool,
thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
if (thread && thread__is_filtered(thread))
- return 0;
+ goto out;
trace__set_base_time(trace, evsel, sample);
@@ -1928,7 +1953,8 @@ static int trace__process_sample(struct perf_tool *tool,
++trace->nr_events;
handler(trace, evsel, event, sample);
}
-
+out:
+ thread__put(thread);
return err;
}
@@ -1988,7 +2014,7 @@ static int trace__record(struct trace *trace, int argc, const char **argv)
for (i = 0; i < (unsigned int)argc; i++)
rec_argv[j++] = argv[i];
- return cmd_record(j, rec_argv, NULL);
+ return cmd_record(j, rec_argv);
}
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
@@ -2415,8 +2441,9 @@ static int trace__replay(struct trace *trace)
trace->tool.exit = perf_event__process_exit;
trace->tool.fork = perf_event__process_fork;
trace->tool.attr = perf_event__process_attr;
- trace->tool.tracing_data = perf_event__process_tracing_data;
+ trace->tool.tracing_data = perf_event__process_tracing_data;
trace->tool.build_id = perf_event__process_build_id;
+ trace->tool.namespaces = perf_event__process_namespaces;
trace->tool.ordered_events = true;
trace->tool.ordering_requires_timestamps = true;
@@ -2785,7 +2812,7 @@ out:
return err;
}
-int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_trace(int argc, const char **argv)
{
const char *trace_usage[] = {
"perf trace [<options>] [<command>]",
diff --git a/tools/perf/builtin-version.c b/tools/perf/builtin-version.c
index 9b10cda6b6dc..b9a095b1db99 100644
--- a/tools/perf/builtin-version.c
+++ b/tools/perf/builtin-version.c
@@ -2,8 +2,7 @@
#include "builtin.h"
#include "perf.h"
-int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused,
- const char *prefix __maybe_unused)
+int cmd_version(int argc __maybe_unused, const char **argv __maybe_unused)
{
printf("perf version %s\n", perf_version_string);
return 0;
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 036e1e35b1a8..26669bf9129c 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -13,35 +13,35 @@ void prune_packed_objects(int);
int read_line_with_nul(char *buf, int size, FILE *file);
int check_pager_config(const char *cmd);
-int cmd_annotate(int argc, const char **argv, const char *prefix);
-int cmd_bench(int argc, const char **argv, const char *prefix);
-int cmd_buildid_cache(int argc, const char **argv, const char *prefix);
-int cmd_buildid_list(int argc, const char **argv, const char *prefix);
-int cmd_config(int argc, const char **argv, const char *prefix);
-int cmd_c2c(int argc, const char **argv, const char *prefix);
-int cmd_diff(int argc, const char **argv, const char *prefix);
-int cmd_evlist(int argc, const char **argv, const char *prefix);
-int cmd_help(int argc, const char **argv, const char *prefix);
-int cmd_sched(int argc, const char **argv, const char *prefix);
-int cmd_kallsyms(int argc, const char **argv, const char *prefix);
-int cmd_list(int argc, const char **argv, const char *prefix);
-int cmd_record(int argc, const char **argv, const char *prefix);
-int cmd_report(int argc, const char **argv, const char *prefix);
-int cmd_stat(int argc, const char **argv, const char *prefix);
-int cmd_timechart(int argc, const char **argv, const char *prefix);
-int cmd_top(int argc, const char **argv, const char *prefix);
-int cmd_script(int argc, const char **argv, const char *prefix);
-int cmd_version(int argc, const char **argv, const char *prefix);
-int cmd_probe(int argc, const char **argv, const char *prefix);
-int cmd_kmem(int argc, const char **argv, const char *prefix);
-int cmd_lock(int argc, const char **argv, const char *prefix);
-int cmd_kvm(int argc, const char **argv, const char *prefix);
-int cmd_test(int argc, const char **argv, const char *prefix);
-int cmd_trace(int argc, const char **argv, const char *prefix);
-int cmd_inject(int argc, const char **argv, const char *prefix);
-int cmd_mem(int argc, const char **argv, const char *prefix);
-int cmd_data(int argc, const char **argv, const char *prefix);
-int cmd_ftrace(int argc, const char **argv, const char *prefix);
+int cmd_annotate(int argc, const char **argv);
+int cmd_bench(int argc, const char **argv);
+int cmd_buildid_cache(int argc, const char **argv);
+int cmd_buildid_list(int argc, const char **argv);
+int cmd_config(int argc, const char **argv);
+int cmd_c2c(int argc, const char **argv);
+int cmd_diff(int argc, const char **argv);
+int cmd_evlist(int argc, const char **argv);
+int cmd_help(int argc, const char **argv);
+int cmd_sched(int argc, const char **argv);
+int cmd_kallsyms(int argc, const char **argv);
+int cmd_list(int argc, const char **argv);
+int cmd_record(int argc, const char **argv);
+int cmd_report(int argc, const char **argv);
+int cmd_stat(int argc, const char **argv);
+int cmd_timechart(int argc, const char **argv);
+int cmd_top(int argc, const char **argv);
+int cmd_script(int argc, const char **argv);
+int cmd_version(int argc, const char **argv);
+int cmd_probe(int argc, const char **argv);
+int cmd_kmem(int argc, const char **argv);
+int cmd_lock(int argc, const char **argv);
+int cmd_kvm(int argc, const char **argv);
+int cmd_test(int argc, const char **argv);
+int cmd_trace(int argc, const char **argv);
+int cmd_inject(int argc, const char **argv);
+int cmd_mem(int argc, const char **argv);
+int cmd_data(int argc, const char **argv);
+int cmd_ftrace(int argc, const char **argv);
int find_scripts(char **scripts_array, char **scripts_path_array);
#endif
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index c747bfd7f14d..83fe2202382e 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -1,7 +1,9 @@
#!/bin/sh
HEADERS='
+include/uapi/linux/fcntl.h
include/uapi/linux/perf_event.h
+include/uapi/linux/stat.h
include/linux/hash.h
include/uapi/linux/hw_breakpoint.h
arch/x86/include/asm/disabled-features.h
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index ac3efd396a72..2d0caf20ff3a 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -9,6 +9,7 @@ perf-buildid-cache mainporcelain common
perf-buildid-list mainporcelain common
perf-data mainporcelain common
perf-diff mainporcelain common
+perf-c2c mainporcelain common
perf-config mainporcelain common
perf-evlist mainporcelain common
perf-ftrace mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index 6d5479e03e0d..9dc346f2b255 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -34,7 +34,7 @@ const char *input_name;
struct cmd_struct {
const char *cmd;
- int (*fn)(int, const char **, const char *);
+ int (*fn)(int, const char **);
int option;
};
@@ -267,71 +267,6 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
return handled;
}
-static int handle_alias(int *argcp, const char ***argv)
-{
- int envchanged = 0, ret = 0, saved_errno = errno;
- int count, option_count;
- const char **new_argv;
- const char *alias_command;
- char *alias_string;
-
- alias_command = (*argv)[0];
- alias_string = alias_lookup(alias_command);
- if (alias_string) {
- if (alias_string[0] == '!') {
- if (*argcp > 1) {
- struct strbuf buf;
-
- if (strbuf_init(&buf, PATH_MAX) < 0 ||
- strbuf_addstr(&buf, alias_string) < 0 ||
- sq_quote_argv(&buf, (*argv) + 1,
- PATH_MAX) < 0)
- die("Failed to allocate memory.");
- free(alias_string);
- alias_string = buf.buf;
- }
- ret = system(alias_string + 1);
- if (ret >= 0 && WIFEXITED(ret) &&
- WEXITSTATUS(ret) != 127)
- exit(WEXITSTATUS(ret));
- die("Failed to run '%s' when expanding alias '%s'",
- alias_string + 1, alias_command);
- }
- count = split_cmdline(alias_string, &new_argv);
- if (count < 0)
- die("Bad alias.%s string", alias_command);
- option_count = handle_options(&new_argv, &count, &envchanged);
- if (envchanged)
- die("alias '%s' changes environment variables\n"
- "You can use '!perf' in the alias to do this.",
- alias_command);
- memmove(new_argv - option_count, new_argv,
- count * sizeof(char *));
- new_argv -= option_count;
-
- if (count < 1)
- die("empty alias for %s", alias_command);
-
- if (!strcmp(alias_command, new_argv[0]))
- die("recursive alias: %s", alias_command);
-
- new_argv = realloc(new_argv, sizeof(char *) *
- (count + *argcp + 1));
- /* insert after command name */
- memcpy(new_argv + count, *argv + 1, sizeof(char *) * *argcp);
- new_argv[count + *argcp] = NULL;
-
- *argv = new_argv;
- *argcp += count - 1;
-
- ret = 1;
- }
-
- errno = saved_errno;
-
- return ret;
-}
-
#define RUN_SETUP (1<<0)
#define USE_PAGER (1<<1)
@@ -339,13 +274,8 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
{
int status;
struct stat st;
- const char *prefix;
char sbuf[STRERR_BUFSIZE];
- prefix = NULL;
- if (p->option & RUN_SETUP)
- prefix = NULL; /* setup_perf_directory(); */
-
if (use_browser == -1)
use_browser = check_browser_config(p->cmd);
@@ -356,7 +286,7 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
commit_pager_choice();
perf_env__set_cmdline(&perf_env, argc, argv);
- status = p->fn(argc, argv, prefix);
+ status = p->fn(argc, argv);
perf_config__exit();
exit_browser(status);
perf_env__exit(&perf_env);
@@ -448,7 +378,8 @@ static void execv_dashed_external(const char **argv)
if (status != -ERR_RUN_COMMAND_EXEC) {
if (IS_RUN_COMMAND_ERR(status)) {
do_die:
- die("unable to run '%s'", argv[0]);
+ pr_err("FATAL: unable to run '%s'", argv[0]);
+ status = -128;
}
exit(-status);
}
@@ -460,25 +391,12 @@ do_die:
static int run_argv(int *argcp, const char ***argv)
{
- int done_alias = 0;
+ /* See if it's an internal command */
+ handle_internal_command(*argcp, *argv);
- while (1) {
- /* See if it's an internal command */
- handle_internal_command(*argcp, *argv);
-
- /* .. then try the external ones */
- execv_dashed_external(*argv);
-
- /* It could be an alias -- this works around the insanity
- * of overriding "perf log" with "perf show" by having
- * alias.log = show
- */
- if (done_alias || !handle_alias(argcp, argv))
- break;
- done_alias = 1;
- }
-
- return done_alias;
+ /* .. then try the external ones */
+ execv_dashed_external(*argv);
+ return 0;
}
static void pthread__block_sigwinch(void)
@@ -566,7 +484,7 @@ int main(int argc, const char **argv)
#ifdef HAVE_LIBAUDIT_SUPPORT
setup_path();
argv[0] = "trace";
- return cmd_trace(argc, argv, NULL);
+ return cmd_trace(argc, argv);
#else
fprintf(stderr,
"trace command not available: missing audit-libs devel package at build time.\n");
@@ -611,17 +529,12 @@ int main(int argc, const char **argv)
while (1) {
static int done_help;
- int was_alias = run_argv(&argc, &argv);
+
+ run_argv(&argc, &argv);
if (errno != ENOENT)
break;
- if (was_alias) {
- fprintf(stderr, "Expansion of alias '%s' failed; "
- "'%s' is not a perf-command\n",
- cmd, argv[0]);
- goto out;
- }
if (!done_help) {
cmd = argv[0] = help_unknown_cmd(cmd);
done_help = 1;
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 1c27d947c2fe..806c216a1078 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -50,6 +50,7 @@ struct record_opts {
bool running_time;
bool full_auxtrace;
bool auxtrace_snapshot_mode;
+ bool record_namespaces;
bool record_switch_events;
bool all_kernel;
bool all_user;
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/uncore.json b/tools/perf/pmu-events/arch/x86/broadwell/uncore.json
new file mode 100644
index 000000000000..28e1e159a3cb
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/broadwell/uncore.json
@@ -0,0 +1,278 @@
+[
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x41",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x81",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EVICTION",
+ "BriefDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+ "PublicDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x44",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x48",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x11",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_M",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in M-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x21",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_M",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in M-state",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x81",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_M",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in M-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x18",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_I",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in I-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x88",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_I",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in I-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x1f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_MESI",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in any MESI-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x2f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_MESI",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in MESI-state",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x8f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_MESI",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in MESI-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x86",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_ES",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in E or S-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x16",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_ES",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in E or S-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x26",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_ES",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in E or S-state",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from it's allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+ "PublicDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from it's allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+ "Counter": "0,",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x80",
+ "UMask": "0x02",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.DRD_DIRECT",
+ "BriefDescription": "Each cycle count number of 'valid' coherent Data Read entries that are in DirectData mode. Such entry is defined as valid when it is allocated till data sent to Core (first chunk, IDI0). Applicable for IA Cores' requests in normal case.",
+ "PublicDescription": "Each cycle count number of 'valid' coherent Data Read entries that are in DirectData mode. Such entry is defined as valid when it is allocated till data sent to Core (first chunk, IDI0). Applicable for IA Cores' requests in normal case.",
+ "Counter": "0,",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+ "BriefDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+ "PublicDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x02",
+ "EventName": "UNC_ARB_TRK_REQUESTS.DRD_DIRECT",
+ "BriefDescription": "Number of Core coherent Data Read entries allocated in DirectData mode",
+ "PublicDescription": "Number of Core coherent Data Read entries allocated in DirectData mode.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x20",
+ "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+ "BriefDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+ "PublicDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x84",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+ "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+ "PublicDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+ "BriefDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.;",
+ "PublicDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "Counter": "0,",
+ "CounterMask": "1",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "NCU",
+ "EventCode": "0x0",
+ "UMask": "0x01",
+ "EventName": "UNC_CLOCK.SOCKET",
+ "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles",
+ "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "Counter": "FIXED",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ }
+] \ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json
index 076459c51d4e..58ed6d33d1f4 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-cache.json
@@ -1,13 +1,13 @@
[
{
- "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+ "BriefDescription": "Uncore cache clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_C_CLOCKTICKS",
"PerPkg": "1",
"Unit": "CBO"
},
{
- "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+ "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
"Counter": "0,1,2,3",
"EventCode": "0x34",
"EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+ "BriefDescription": "M line evictions from LLC (writebacks to memory)",
"Counter": "0,1,2,3",
"EventCode": "0x37",
"EventName": "UNC_C_LLC_VICTIMS.M_STATE",
@@ -212,7 +212,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+ "BriefDescription": "read requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS",
@@ -221,7 +221,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "read requests to local home agent. Derived from unc_h_requests.reads_local",
+ "BriefDescription": "read requests to local home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS_LOCAL",
@@ -230,7 +230,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "read requests to remote home agent. Derived from unc_h_requests.reads_remote",
+ "BriefDescription": "read requests to remote home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS_REMOTE",
@@ -239,7 +239,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+ "BriefDescription": "write requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES",
@@ -248,7 +248,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to local home agent. Derived from unc_h_requests.writes_local",
+ "BriefDescription": "write requests to local home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES_LOCAL",
@@ -257,7 +257,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to remote home agent. Derived from unc_h_requests.writes_remote",
+ "BriefDescription": "write requests to remote home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES_REMOTE",
@@ -266,7 +266,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously). Derived from unc_h_snoop_resp.rspcnflct",
+ "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously)",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPCNFLCT",
@@ -275,7 +275,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+ "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
@@ -285,7 +285,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+ "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
@@ -295,7 +295,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+ "BriefDescription": "Shared line response from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPS",
@@ -305,7 +305,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+ "BriefDescription": "Shared line forwarded from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-memory.json b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-memory.json
index d17dc235f734..f4b0745cdbbf 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-memory.json
@@ -3,7 +3,7 @@
"BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.RD",
+ "EventName": "LLC_MISSES.MEM_READ",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0x3",
@@ -13,48 +13,51 @@
"BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.WR",
+ "EventName": "LLC_MISSES.MEM_WRITE",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0xC",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory controller clock ticks. Derived from unc_m_clockticks",
+ "BriefDescription": "Memory controller clock ticks",
"Counter": "0,1,2,3",
- "EventName": "UNC_M_CLOCKTICKS",
+ "EventName": "UNC_M_DCLOCKTICKS",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+ "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
"Counter": "0,1,2,3",
"EventCode": "0x85",
"EventName": "UNC_M_POWER_CHANNEL_PPD",
- "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+ "MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_DCLOCKTICKS) * 100.",
+ "MetricName": "power_channel_ppd %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+ "BriefDescription": "Cycles all ranks are in critical thermal throttle",
"Counter": "0,1,2,3",
"EventCode": "0x86",
"EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
- "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+ "MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_DCLOCKTICKS) * 100.",
+ "MetricName": "power_critical_throttle_cycles %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+ "BriefDescription": "Cycles Memory is in self refresh power mode",
"Counter": "0,1,2,3",
"EventCode": "0x43",
"EventName": "UNC_M_POWER_SELF_REFRESH",
- "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+ "MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_DCLOCKTICKS) * 100.",
+ "MetricName": "power_self_refresh %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charges due to page misses. Derived from unc_m_pre_count.page_miss",
+ "BriefDescription": "Pre-charges due to page misses",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -63,7 +66,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charge for reads. Derived from unc_m_pre_count.rd",
+ "BriefDescription": "Pre-charge for reads",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.RD",
@@ -72,7 +75,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charge for writes. Derived from unc_m_pre_count.wr",
+ "BriefDescription": "Pre-charge for writes",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.WR",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
index b44d43088bbb..dd1b95655d1d 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/uncore-power.json
@@ -1,83 +1,91 @@
[
{
- "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+ "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
"Counter": "0,1,2,3",
"EventName": "UNC_P_CLOCKTICKS",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C0 and C1. Derived from unc_p_power_state_occupancy.cores_c0",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
"Filter": "occ_sel=1",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c0 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C3. Derived from unc_p_power_state_occupancy.cores_c3",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
"Filter": "occ_sel=2",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c3 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C6 and C7. Derived from unc_p_power_state_occupancy.cores_c6",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events ",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
"Filter": "occ_sel=3",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c6 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "External Prochot. Derived from unc_p_prochot_external_cycles",
+ "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
"Counter": "0,1,2,3",
"EventCode": "0xA",
"EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
"MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "prochot_external_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Thermal Strongest Upper Limit Cycles. Derived from unc_p_freq_max_limit_thermal_cycles",
+ "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x4",
"EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_limit_thermal_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "OS Strongest Upper Limit Cycles. Derived from unc_p_freq_max_os_cycles",
+ "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x6",
"EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_os_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Power Strongest Upper Limit Cycles. Derived from unc_p_freq_max_power_cycles",
+ "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x5",
"EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_power_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+ "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x74",
"EventName": "UNC_P_FREQ_TRANS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_trans_cycles %",
"PerPkg": "1",
"Unit": "PCU"
}
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
index 076459c51d4e..58ed6d33d1f4 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
@@ -1,13 +1,13 @@
[
{
- "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+ "BriefDescription": "Uncore cache clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_C_CLOCKTICKS",
"PerPkg": "1",
"Unit": "CBO"
},
{
- "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+ "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
"Counter": "0,1,2,3",
"EventCode": "0x34",
"EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+ "BriefDescription": "M line evictions from LLC (writebacks to memory)",
"Counter": "0,1,2,3",
"EventCode": "0x37",
"EventName": "UNC_C_LLC_VICTIMS.M_STATE",
@@ -212,7 +212,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+ "BriefDescription": "read requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS",
@@ -221,7 +221,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "read requests to local home agent. Derived from unc_h_requests.reads_local",
+ "BriefDescription": "read requests to local home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS_LOCAL",
@@ -230,7 +230,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "read requests to remote home agent. Derived from unc_h_requests.reads_remote",
+ "BriefDescription": "read requests to remote home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS_REMOTE",
@@ -239,7 +239,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+ "BriefDescription": "write requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES",
@@ -248,7 +248,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to local home agent. Derived from unc_h_requests.writes_local",
+ "BriefDescription": "write requests to local home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES_LOCAL",
@@ -257,7 +257,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to remote home agent. Derived from unc_h_requests.writes_remote",
+ "BriefDescription": "write requests to remote home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES_REMOTE",
@@ -266,7 +266,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously). Derived from unc_h_snoop_resp.rspcnflct",
+ "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously)",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPCNFLCT",
@@ -275,7 +275,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+ "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
@@ -285,7 +285,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+ "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
@@ -295,7 +295,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+ "BriefDescription": "Shared line response from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPS",
@@ -305,7 +305,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+ "BriefDescription": "Shared line forwarded from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
index 39387f7909b2..824961318c1e 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "QPI clock ticks. Derived from unc_q_clockticks",
+ "BriefDescription": "QPI clock ticks",
"Counter": "0,1,2,3",
"EventCode": "0x14",
"EventName": "UNC_Q_CLOCKTICKS",
@@ -10,7 +10,7 @@
{
"BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
"Counter": "0,1,2,3",
- "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
+ "EventName": "QPI_DATA_BANDWIDTH_TX",
"PerPkg": "1",
"ScaleUnit": "8Bytes",
"UMask": "0x2",
@@ -19,7 +19,7 @@
{
"BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
"Counter": "0,1,2,3",
- "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
+ "EventName": "QPI_CTL_BANDWIDTH_TX",
"PerPkg": "1",
"ScaleUnit": "8Bytes",
"UMask": "0x4",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
index d17dc235f734..66eed399724c 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
@@ -3,7 +3,7 @@
"BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.RD",
+ "EventName": "LLC_MISSES.MEM_READ",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0x3",
@@ -13,48 +13,51 @@
"BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.WR",
+ "EventName": "LLC_MISSES.MEM_WRITE",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0xC",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory controller clock ticks. Derived from unc_m_clockticks",
+ "BriefDescription": "Memory controller clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_M_CLOCKTICKS",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+ "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
"Counter": "0,1,2,3",
"EventCode": "0x85",
"EventName": "UNC_M_POWER_CHANNEL_PPD",
"MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_channel_ppd %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+ "BriefDescription": "Cycles all ranks are in critical thermal throttle",
"Counter": "0,1,2,3",
"EventCode": "0x86",
"EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
"MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_critical_throttle_cycles %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+ "BriefDescription": "Cycles Memory is in self refresh power mode",
"Counter": "0,1,2,3",
"EventCode": "0x43",
"EventName": "UNC_M_POWER_SELF_REFRESH",
"MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_self_refresh %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charges due to page misses. Derived from unc_m_pre_count.page_miss",
+ "BriefDescription": "Pre-charges due to page misses",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -63,7 +66,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charge for reads. Derived from unc_m_pre_count.rd",
+ "BriefDescription": "Pre-charge for reads",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.RD",
@@ -72,7 +75,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charge for writes. Derived from unc_m_pre_count.wr",
+ "BriefDescription": "Pre-charge for writes",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.WR",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
index b44d43088bbb..dd1b95655d1d 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-power.json
@@ -1,83 +1,91 @@
[
{
- "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+ "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
"Counter": "0,1,2,3",
"EventName": "UNC_P_CLOCKTICKS",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C0 and C1. Derived from unc_p_power_state_occupancy.cores_c0",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
"Filter": "occ_sel=1",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c0 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C3. Derived from unc_p_power_state_occupancy.cores_c3",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
"Filter": "occ_sel=2",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c3 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C6 and C7. Derived from unc_p_power_state_occupancy.cores_c6",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events ",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
"Filter": "occ_sel=3",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c6 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "External Prochot. Derived from unc_p_prochot_external_cycles",
+ "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
"Counter": "0,1,2,3",
"EventCode": "0xA",
"EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
"MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "prochot_external_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Thermal Strongest Upper Limit Cycles. Derived from unc_p_freq_max_limit_thermal_cycles",
+ "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x4",
"EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_limit_thermal_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "OS Strongest Upper Limit Cycles. Derived from unc_p_freq_max_os_cycles",
+ "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x6",
"EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_os_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Power Strongest Upper Limit Cycles. Derived from unc_p_freq_max_power_cycles",
+ "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x5",
"EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_power_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+ "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x74",
"EventName": "UNC_P_FREQ_TRANS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_trans_cycles %",
"PerPkg": "1",
"Unit": "PCU"
}
diff --git a/tools/perf/pmu-events/arch/x86/haswell/uncore.json b/tools/perf/pmu-events/arch/x86/haswell/uncore.json
new file mode 100644
index 000000000000..3ef5c21fef56
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/haswell/uncore.json
@@ -0,0 +1,374 @@
+[
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x21",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EXTERNAL",
+ "BriefDescription": "An external snoop misses in some processor core.",
+ "PublicDescription": "An external snoop misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x41",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x81",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EVICTION",
+ "BriefDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+ "PublicDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x24",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_EXTERNAL",
+ "BriefDescription": "An external snoop hits a non-modified line in some processor core.",
+ "PublicDescription": "An external snoop hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x44",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x84",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_EVICTION",
+ "BriefDescription": "A cross-core snoop resulted from L3 Eviction which hits a non-modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop resulted from L3 Eviction which hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x28",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_EXTERNAL",
+ "BriefDescription": "An external snoop hits a modified line in some processor core.",
+ "PublicDescription": "An external snoop hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x48",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x88",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_EVICTION",
+ "BriefDescription": "A cross-core snoop resulted from L3 Eviction which hits a modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop resulted from L3 Eviction which hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x11",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_M",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in M-state.",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x21",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_M",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in M-state.",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x41",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_M",
+ "BriefDescription": "L3 Lookup external snoop request that access cache and found line in M-state.",
+ "PublicDescription": "L3 Lookup external snoop request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x81",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_M",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in M-state.",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x18",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_I",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in I-state.",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x28",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_I",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in I-state.",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x48",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_I",
+ "BriefDescription": "L3 Lookup external snoop request that access cache and found line in I-state.",
+ "PublicDescription": "L3 Lookup external snoop request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x88",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_I",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in I-state.",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x1f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_MESI",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x2f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_MESI",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x4f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_MESI",
+ "BriefDescription": "L3 Lookup external snoop request that access cache and found line in MESI-state.",
+ "PublicDescription": "L3 Lookup external snoop request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x8f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_MESI",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x86",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_ES",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x46",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_ES",
+ "BriefDescription": "L3 Lookup external snoop request that access cache and found line in E or S-state.",
+ "PublicDescription": "L3 Lookup external snoop request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x16",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_ES",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x26",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_ES",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from it's allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+ "PublicDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from it's allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+ "BriefDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+ "PublicDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x20",
+ "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+ "BriefDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+ "PublicDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x83",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_OCCUPANCY.All",
+ "BriefDescription": "Each cycle count number of valid entries in Coherency Tracker queue from allocation till deallocation. Aperture requests (snoops) appear as NC decoded internally and become coherent (snoop L3, access memory)",
+ "PublicDescription": "Each cycle count number of valid entries in Coherency Tracker queue from allocation till deallocation. Aperture requests (snoops) appear as NC decoded internally and become coherent (snoop L3, access memory).",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x84",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+ "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+ "PublicDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "NCU",
+ "EventCode": "0x0",
+ "UMask": "0x01",
+ "EventName": "UNC_CLOCK.SOCKET",
+ "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "Counter": "FIXED",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ }
+] \ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json
index 076459c51d4e..58ed6d33d1f4 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-cache.json
@@ -1,13 +1,13 @@
[
{
- "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+ "BriefDescription": "Uncore cache clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_C_CLOCKTICKS",
"PerPkg": "1",
"Unit": "CBO"
},
{
- "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+ "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
"Counter": "0,1,2,3",
"EventCode": "0x34",
"EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+ "BriefDescription": "M line evictions from LLC (writebacks to memory)",
"Counter": "0,1,2,3",
"EventCode": "0x37",
"EventName": "UNC_C_LLC_VICTIMS.M_STATE",
@@ -212,7 +212,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+ "BriefDescription": "read requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS",
@@ -221,7 +221,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "read requests to local home agent. Derived from unc_h_requests.reads_local",
+ "BriefDescription": "read requests to local home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS_LOCAL",
@@ -230,7 +230,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "read requests to remote home agent. Derived from unc_h_requests.reads_remote",
+ "BriefDescription": "read requests to remote home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS_REMOTE",
@@ -239,7 +239,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+ "BriefDescription": "write requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES",
@@ -248,7 +248,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to local home agent. Derived from unc_h_requests.writes_local",
+ "BriefDescription": "write requests to local home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES_LOCAL",
@@ -257,7 +257,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to remote home agent. Derived from unc_h_requests.writes_remote",
+ "BriefDescription": "write requests to remote home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES_REMOTE",
@@ -266,7 +266,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously). Derived from unc_h_snoop_resp.rspcnflct",
+ "BriefDescription": "Conflict requests (requests for same address from multiple agents simultaneously)",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPCNFLCT",
@@ -275,7 +275,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+ "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
@@ -285,7 +285,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+ "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
@@ -295,7 +295,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+ "BriefDescription": "Shared line response from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPS",
@@ -305,7 +305,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+ "BriefDescription": "Shared line forwarded from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json
index 39387f7909b2..824961318c1e 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "QPI clock ticks. Derived from unc_q_clockticks",
+ "BriefDescription": "QPI clock ticks",
"Counter": "0,1,2,3",
"EventCode": "0x14",
"EventName": "UNC_Q_CLOCKTICKS",
@@ -10,7 +10,7 @@
{
"BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
"Counter": "0,1,2,3",
- "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
+ "EventName": "QPI_DATA_BANDWIDTH_TX",
"PerPkg": "1",
"ScaleUnit": "8Bytes",
"UMask": "0x2",
@@ -19,7 +19,7 @@
{
"BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
"Counter": "0,1,2,3",
- "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
+ "EventName": "QPI_CTL_BANDWIDTH_TX",
"PerPkg": "1",
"ScaleUnit": "8Bytes",
"UMask": "0x4",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json
index d17dc235f734..66eed399724c 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json
@@ -3,7 +3,7 @@
"BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.RD",
+ "EventName": "LLC_MISSES.MEM_READ",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0x3",
@@ -13,48 +13,51 @@
"BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.WR",
+ "EventName": "LLC_MISSES.MEM_WRITE",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0xC",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory controller clock ticks. Derived from unc_m_clockticks",
+ "BriefDescription": "Memory controller clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_M_CLOCKTICKS",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+ "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
"Counter": "0,1,2,3",
"EventCode": "0x85",
"EventName": "UNC_M_POWER_CHANNEL_PPD",
"MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_channel_ppd %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+ "BriefDescription": "Cycles all ranks are in critical thermal throttle",
"Counter": "0,1,2,3",
"EventCode": "0x86",
"EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
"MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_critical_throttle_cycles %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+ "BriefDescription": "Cycles Memory is in self refresh power mode",
"Counter": "0,1,2,3",
"EventCode": "0x43",
"EventName": "UNC_M_POWER_SELF_REFRESH",
"MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_self_refresh %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charges due to page misses. Derived from unc_m_pre_count.page_miss",
+ "BriefDescription": "Pre-charges due to page misses",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -63,7 +66,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charge for reads. Derived from unc_m_pre_count.rd",
+ "BriefDescription": "Pre-charge for reads",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.RD",
@@ -72,7 +75,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Pre-charge for writes. Derived from unc_m_pre_count.wr",
+ "BriefDescription": "Pre-charge for writes",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.WR",
diff --git a/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json b/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
index b44d43088bbb..dd1b95655d1d 100644
--- a/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/haswellx/uncore-power.json
@@ -1,83 +1,91 @@
[
{
- "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+ "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
"Counter": "0,1,2,3",
"EventName": "UNC_P_CLOCKTICKS",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C0 and C1. Derived from unc_p_power_state_occupancy.cores_c0",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
"Filter": "occ_sel=1",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c0 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C3. Derived from unc_p_power_state_occupancy.cores_c3",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
"Filter": "occ_sel=2",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c3 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "C6 and C7. Derived from unc_p_power_state_occupancy.cores_c6",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events ",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
"Filter": "occ_sel=3",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c6 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "External Prochot. Derived from unc_p_prochot_external_cycles",
+ "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
"Counter": "0,1,2,3",
"EventCode": "0xA",
"EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
"MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "prochot_external_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Thermal Strongest Upper Limit Cycles. Derived from unc_p_freq_max_limit_thermal_cycles",
+ "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x4",
"EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_limit_thermal_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "OS Strongest Upper Limit Cycles. Derived from unc_p_freq_max_os_cycles",
+ "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x6",
"EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_os_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Power Strongest Upper Limit Cycles. Derived from unc_p_freq_max_power_cycles",
+ "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x5",
"EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_power_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+ "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x74",
"EventName": "UNC_P_FREQ_TRANS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_trans_cycles %",
"PerPkg": "1",
"Unit": "PCU"
}
diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/uncore.json b/tools/perf/pmu-events/arch/x86/ivybridge/uncore.json
new file mode 100644
index 000000000000..42c70eed05a2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/ivybridge/uncore.json
@@ -0,0 +1,314 @@
+[
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x01",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS",
+ "BriefDescription": "A snoop misses in some processor core.",
+ "PublicDescription": "A snoop misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x02",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL",
+ "BriefDescription": "A snoop invalidates a non-modified line in some processor core.",
+ "PublicDescription": "A snoop invalidates a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x04",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT",
+ "BriefDescription": "A snoop hits a non-modified line in some processor core.",
+ "PublicDescription": "A snoop hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x08",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM",
+ "BriefDescription": "A snoop hits a modified line in some processor core.",
+ "PublicDescription": "A snoop hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x10",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL_M",
+ "BriefDescription": "A snoop invalidates a modified line in some processor core.",
+ "PublicDescription": "A snoop invalidates a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x20",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.EXTERNAL_FILTER",
+ "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+ "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x40",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.XCORE_FILTER",
+ "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+ "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x80",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.EVICTION_FILTER",
+ "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+ "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x01",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.M",
+ "BriefDescription": "LLC lookup request that access cache and found line in M-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x02",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.E",
+ "BriefDescription": "LLC lookup request that access cache and found line in E-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in E-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x04",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.S",
+ "BriefDescription": "LLC lookup request that access cache and found line in S-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x08",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.I",
+ "BriefDescription": "LLC lookup request that access cache and found line in I-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x10",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_FILTER",
+ "BriefDescription": "Filter on processor core initiated cacheable read requests.",
+ "PublicDescription": "Filter on processor core initiated cacheable read requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x20",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_FILTER",
+ "BriefDescription": "Filter on processor core initiated cacheable write requests.",
+ "PublicDescription": "Filter on processor core initiated cacheable write requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x40",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_FILTER",
+ "BriefDescription": "Filter on external snoop requests.",
+ "PublicDescription": "Filter on external snoop requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x80",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_REQUEST_FILTER",
+ "BriefDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+ "PublicDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "PublicDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x81",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+ "BriefDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "PublicDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x81",
+ "UMask": "0x20",
+ "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+ "BriefDescription": "Counts the number of allocated write entries, include full, partial, and LLC evictions.",
+ "PublicDescription": "Counts the number of allocated write entries, include full, partial, and LLC evictions.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x81",
+ "UMask": "0x80",
+ "EventName": "UNC_ARB_TRK_REQUESTS.EVICTIONS",
+ "BriefDescription": "Counts the number of LLC evictions allocated.",
+ "PublicDescription": "Counts the number of LLC evictions allocated.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x83",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+ "PublicDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x84",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+ "BriefDescription": "Number of requests allocated in Coherency Tracker.",
+ "PublicDescription": "Number of requests allocated in Coherency Tracker.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+ "BriefDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "PublicDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "Counter": "0,1",
+ "CounterMask": "1",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_OVER_HALF_FULL",
+ "BriefDescription": "Cycles with at least half of the requests outstanding are waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "PublicDescription": "Cycles with at least half of the requests outstanding are waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "Counter": "0,1",
+ "CounterMask": "10",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x0",
+ "UMask": "0x01",
+ "EventName": "UNC_CLOCK.SOCKET",
+ "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "Counter": "Fixed",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x06",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ES",
+ "BriefDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ }
+] \ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json
index 2efdc6772e0b..267410594833 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json
@@ -1,13 +1,13 @@
[
{
- "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+ "BriefDescription": "Uncore cache clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_C_CLOCKTICKS",
"PerPkg": "1",
"Unit": "CBO"
},
{
- "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+ "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
"Counter": "0,1",
"EventCode": "0x34",
"EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+ "BriefDescription": "M line evictions from LLC (writebacks to memory)",
"Counter": "0,1",
"EventCode": "0x37",
"EventName": "UNC_C_LLC_VICTIMS.M_STATE",
@@ -237,7 +237,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "Occupancy for all LLC misses that are addressed to local memory. Derived from unc_c_tor_occupancy.miss_local",
+ "BriefDescription": "Occupancy for all LLC misses that are addressed to local memory",
"EventCode": "0x36",
"EventName": "UNC_C_TOR_OCCUPANCY.MISS_LOCAL",
"PerPkg": "1",
@@ -254,7 +254,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "Occupancy for all LLC misses that are addressed to remote memory. Derived from unc_c_tor_occupancy.miss_remote",
+ "BriefDescription": "Occupancy for all LLC misses that are addressed to remote memory",
"EventCode": "0x36",
"EventName": "UNC_C_TOR_OCCUPANCY.MISS_REMOTE",
"PerPkg": "1",
@@ -262,7 +262,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "Read requests to home agent. Derived from unc_h_requests.reads",
+ "BriefDescription": "Read requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS",
@@ -271,7 +271,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Write requests to home agent. Derived from unc_h_requests.writes",
+ "BriefDescription": "Write requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES",
@@ -280,7 +280,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache along with writeback to memory. Derived from unc_h_snoop_resp.rsp_fwd_wb",
+ "BriefDescription": "M line forwarded from remote cache along with writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSP_FWD_WB",
@@ -290,7 +290,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "M line forwarded from remote cache with no writeback to memory. Derived from unc_h_snoop_resp.rspifwd",
+ "BriefDescription": "M line forwarded from remote cache with no writeback to memory",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPIFWD",
@@ -300,7 +300,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line response from remote cache. Derived from unc_h_snoop_resp.rsps",
+ "BriefDescription": "Shared line response from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPS",
@@ -310,7 +310,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "Shared line forwarded from remote cache. Derived from unc_h_snoop_resp.rspsfwd",
+ "BriefDescription": "Shared line forwarded from remote cache",
"Counter": "0,1,2,3",
"EventCode": "0x21",
"EventName": "UNC_H_SNOOP_RESP.RSPSFWD",
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json
index d7e2fda1d695..b798a860bc81 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "QPI clock ticks. Use to get percentages for QPI cycles events. Derived from unc_q_clockticks",
+ "BriefDescription": "QPI clock ticks. Use to get percentages for QPI cycles events",
"Counter": "0,1,2,3",
"EventCode": "0x14",
"EventName": "UNC_Q_CLOCKTICKS",
@@ -8,25 +8,27 @@
"Unit": "QPI LL"
},
{
- "BriefDescription": "Cycles where receiving QPI link is in half-width mode. Derived from unc_q_rxl0p_power_cycles",
+ "BriefDescription": "Cycles where receiving QPI link is in half-width mode",
"Counter": "0,1,2,3",
"EventCode": "0x10",
"EventName": "UNC_Q_RxL0P_POWER_CYCLES",
"MetricExpr": "(UNC_Q_RxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+ "MetricName": "rxl0p_power_cycles %",
"PerPkg": "1",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Cycles where transmitting QPI link is in half-width mode. Derived from unc_q_txl0p_power_cycles",
+ "BriefDescription": "Cycles where transmitting QPI link is in half-width mode",
"Counter": "0,1,2,3",
"EventCode": "0xd",
"EventName": "UNC_Q_TxL0P_POWER_CYCLES",
"MetricExpr": "(UNC_Q_TxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+ "MetricName": "txl0p_power_cycles %",
"PerPkg": "1",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
+ "BriefDescription": "Number of data flits transmitted ",
"Counter": "0,1,2,3",
"EventName": "UNC_Q_TxL_FLITS_G0.DATA",
"PerPkg": "1",
@@ -35,7 +37,7 @@
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
+ "BriefDescription": "Number of non data (control) flits transmitted ",
"Counter": "0,1,2,3",
"EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
"PerPkg": "1",
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json
index ac4ad4d6357b..df4b43294fa0 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "Memory page activates for reads and writes. Derived from unc_m_act_count.rd",
+ "BriefDescription": "Memory page activates for reads and writes",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_M_ACT_COUNT.RD",
@@ -13,7 +13,7 @@
"BriefDescription": "Read requests to memory controller. Derived from unc_m_cas_count.rd",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.RD",
+ "EventName": "LLC_MISSES.MEM_READ",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0x3",
@@ -23,48 +23,51 @@
"BriefDescription": "Write requests to memory controller. Derived from unc_m_cas_count.wr",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.WR",
+ "EventName": "LLC_MISSES.MEM_WRITE",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0xC",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory controller clock ticks. Use to generate percentages for memory controller CYCLES events. Derived from unc_m_clockticks",
+ "BriefDescription": "Memory controller clock ticks. Use to generate percentages for memory controller CYCLES events",
"Counter": "0,1,2,3",
"EventName": "UNC_M_CLOCKTICKS",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+ "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
"Counter": "0,1,2,3",
"EventCode": "0x85",
"EventName": "UNC_M_POWER_CHANNEL_PPD",
"MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_channel_ppd %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+ "BriefDescription": "Cycles all ranks are in critical thermal throttle",
"Counter": "0,1,2,3",
"EventCode": "0x86",
"EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
"MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_critical_throttle_cycles %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+ "BriefDescription": "Cycles Memory is in self refresh power mode",
"Counter": "0,1,2,3",
"EventCode": "0x43",
"EventName": "UNC_M_POWER_SELF_REFRESH",
"MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_self_refresh %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory page conflicts. Derived from unc_m_pre_count.page_miss",
+ "BriefDescription": "Memory page conflicts",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
diff --git a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
index dc2586db0dfc..d40498f2cb1e 100644
--- a/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json
@@ -1,44 +1,48 @@
[
{
- "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+ "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
"Counter": "0,1,2,3",
"EventName": "UNC_P_CLOCKTICKS",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band0=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band0_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band0=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xb",
"EventName": "UNC_P_FREQ_BAND0_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band0_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band1=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band1_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band1=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xc",
"EventName": "UNC_P_FREQ_BAND1_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band1_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band2=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band2_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band2=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xd",
"EventName": "UNC_P_FREQ_BAND2_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band2_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band3_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xe",
"EventName": "UNC_P_FREQ_BAND3_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band3_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -49,6 +53,7 @@
"EventName": "UNC_P_FREQ_BAND0_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band0_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -59,6 +64,7 @@
"EventName": "UNC_P_FREQ_BAND1_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band1_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -69,6 +75,7 @@
"EventName": "UNC_P_FREQ_BAND2_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band2_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -79,90 +86,100 @@
"EventName": "UNC_P_FREQ_BAND3_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band3_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c0",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
"Filter": "occ_sel=1",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c0 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c3",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
"Filter": "occ_sel=2",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c3 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c6",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events ",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
"Filter": "occ_sel=3",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c6 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip. Derived from unc_p_prochot_external_cycles",
+ "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
"Counter": "0,1,2,3",
"EventCode": "0xa",
"EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
"MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "prochot_external_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when thermal conditions are the upper limit on frequency. This is related to the THERMAL_THROTTLE CYCLES_ABOVE_TEMP event, which always counts cycles when we are above the thermal temperature. This event (STRONGEST_UPPER_LIMIT) is sampled at the output of the algorithm that determines the actual frequency, while THERMAL_THROTTLE looks at the input. Derived from unc_p_freq_max_limit_thermal_cycles",
+ "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x4",
"EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_limit_thermal_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency. Derived from unc_p_freq_max_os_cycles",
+ "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x6",
"EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_os_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency. Derived from unc_p_freq_max_power_cycles",
+ "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x5",
"EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_power_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency. Derived from unc_p_freq_max_current_cycles",
+ "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x7",
"EventName": "UNC_P_FREQ_MAX_CURRENT_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_CURRENT_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_current_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when the system is changing frequency. This can not be filtered by thread ID. One can also use it with the occupancy counter that monitors number of threads in C0 to estimate the performance impact that frequency transitions had on the system. Derived from unc_p_freq_trans_cycles",
+ "BriefDescription": "Cycles spent changing Frequency",
"Counter": "0,1,2,3",
"EventCode": "0x60",
"EventName": "UNC_P_FREQ_TRANS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_trans_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -173,6 +190,7 @@
"EventName": "UNC_P_FREQ_GE_1200MHZ_CYCLES",
"Filter": "filter_band0=1200",
"MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_1200mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -183,6 +201,7 @@
"EventName": "UNC_P_FREQ_GE_2000MHZ_CYCLES",
"Filter": "filter_band1=2000",
"MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_2000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -193,6 +212,7 @@
"EventName": "UNC_P_FREQ_GE_3000MHZ_CYCLES",
"Filter": "filter_band2=3000",
"MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_3000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -203,6 +223,7 @@
"EventName": "UNC_P_FREQ_GE_4000MHZ_CYCLES",
"Filter": "filter_band3=4000",
"MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_4000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -213,6 +234,7 @@
"EventName": "UNC_P_FREQ_GE_1200MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band0=1200",
"MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_1200mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -223,6 +245,7 @@
"EventName": "UNC_P_FREQ_GE_2000MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band1=2000",
"MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_2000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -233,6 +256,7 @@
"EventName": "UNC_P_FREQ_GE_3000MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band2=4000",
"MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_3000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -243,6 +267,7 @@
"EventName": "UNC_P_FREQ_GE_4000MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band3=4000",
"MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_4000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
}
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-cache.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-cache.json
index 2f23cf0129e7..3fa61d962607 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-cache.json
@@ -1,13 +1,13 @@
[
{
- "BriefDescription": "Uncore cache clock ticks. Derived from unc_c_clockticks",
+ "BriefDescription": "Uncore cache clock ticks",
"Counter": "0,1,2,3",
"EventName": "UNC_C_CLOCKTICKS",
"PerPkg": "1",
"Unit": "CBO"
},
{
- "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch). Derived from unc_c_llc_lookup.any",
+ "BriefDescription": "All LLC Misses (code+ data rd + data wr - including demand and prefetch)",
"Counter": "0,1",
"EventCode": "0x34",
"EventName": "UNC_C_LLC_LOOKUP.ANY",
@@ -18,7 +18,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "M line evictions from LLC (writebacks to memory). Derived from unc_c_llc_victims.m_state",
+ "BriefDescription": "M line evictions from LLC (writebacks to memory)",
"Counter": "0,1",
"EventCode": "0x37",
"EventName": "UNC_C_LLC_VICTIMS.M_STATE",
@@ -171,11 +171,12 @@
"Unit": "CBO"
},
{
- "BriefDescription": "Occupancy counter for all LLC misses; we divide this by UNC_C_CLOCKTICKS to get average Q depth. Derived from unc_c_tor_occupancy.miss_all",
+ "BriefDescription": "Occupancy counter for all LLC misses; we divide this by UNC_C_CLOCKTICKS to get average Q depth",
"EventCode": "0x36",
"EventName": "UNC_C_TOR_OCCUPANCY.MISS_ALL",
"Filter": "filter_opc=0x182",
"MetricExpr": "(UNC_C_TOR_OCCUPANCY.MISS_ALL / UNC_C_CLOCKTICKS) * 100.",
+ "MetricName": "tor_occupancy.miss_all %",
"PerPkg": "1",
"UMask": "0xa",
"Unit": "CBO"
@@ -189,7 +190,7 @@
"Unit": "CBO"
},
{
- "BriefDescription": "read requests to home agent. Derived from unc_h_requests.reads",
+ "BriefDescription": "read requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.READS",
@@ -198,7 +199,7 @@
"Unit": "HA"
},
{
- "BriefDescription": "write requests to home agent. Derived from unc_h_requests.writes",
+ "BriefDescription": "write requests to home agent",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_H_REQUESTS.WRITES",
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-interconnect.json
index 63351876eb57..1b53c0e609e3 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-interconnect.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-interconnect.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "QPI clock ticks. Used to get percentages of QPI cycles events. Derived from unc_q_clockticks",
+ "BriefDescription": "QPI clock ticks. Used to get percentages of QPI cycles events",
"Counter": "0,1,2,3",
"EventCode": "0x14",
"EventName": "UNC_Q_CLOCKTICKS",
@@ -8,25 +8,27 @@
"Unit": "QPI LL"
},
{
- "BriefDescription": "Cycles where receiving QPI link is in half-width mode. Derived from unc_q_rxl0p_power_cycles",
+ "BriefDescription": "Cycles where receiving QPI link is in half-width mode",
"Counter": "0,1,2,3",
"EventCode": "0x10",
"EventName": "UNC_Q_RxL0P_POWER_CYCLES",
"MetricExpr": "(UNC_Q_RxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+ "MetricName": "rxl0p_power_cycles %",
"PerPkg": "1",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Cycles where transmitting QPI link is in half-width mode. Derived from unc_q_txl0p_power_cycles",
+ "BriefDescription": "Cycles where transmitting QPI link is in half-width mode",
"Counter": "0,1,2,3",
"EventCode": "0xd",
"EventName": "UNC_Q_TxL0P_POWER_CYCLES",
"MetricExpr": "(UNC_Q_TxL0P_POWER_CYCLES / UNC_Q_CLOCKTICKS) * 100.",
+ "MetricName": "txl0p_power_cycles %",
"PerPkg": "1",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
+ "BriefDescription": "Number of data flits transmitted ",
"Counter": "0,1,2,3",
"EventName": "UNC_Q_TxL_FLITS_G0.DATA",
"PerPkg": "1",
@@ -35,7 +37,7 @@
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
+ "BriefDescription": "Number of non data (control) flits transmitted ",
"Counter": "0,1,2,3",
"EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
"PerPkg": "1",
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-memory.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-memory.json
index e2cf6daa7b37..8551cebeba23 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-memory.json
@@ -1,6 +1,6 @@
[
{
- "BriefDescription": "Memory page activates. Derived from unc_m_act_count",
+ "BriefDescription": "Memory page activates",
"Counter": "0,1,2,3",
"EventCode": "0x1",
"EventName": "UNC_M_ACT_COUNT",
@@ -11,7 +11,7 @@
"BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.RD",
+ "EventName": "LLC_MISSES.MEM_READ",
"PerPkg": "1",
"UMask": "0x3",
"Unit": "iMC"
@@ -20,47 +20,50 @@
"BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.WR",
+ "EventName": "LLC_MISSES.MEM_WRITE",
"PerPkg": "1",
"UMask": "0xc",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory controller clock ticks. Used to get percentages of memory controller cycles events. Derived from unc_m_clockticks",
+ "BriefDescription": "Memory controller clock ticks. Used to get percentages of memory controller cycles events",
"Counter": "0,1,2,3",
"EventName": "UNC_M_CLOCKTICKS",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode. Derived from unc_m_power_channel_ppd",
+ "BriefDescription": "Cycles where DRAM ranks are in power down (CKE) mode",
"Counter": "0,1,2,3",
"EventCode": "0x85",
"EventName": "UNC_M_POWER_CHANNEL_PPD",
"MetricExpr": "(UNC_M_POWER_CHANNEL_PPD / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_channel_ppd %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles all ranks are in critical thermal throttle. Derived from unc_m_power_critical_throttle_cycles",
+ "BriefDescription": "Cycles all ranks are in critical thermal throttle",
"Counter": "0,1,2,3",
"EventCode": "0x86",
"EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES",
"MetricExpr": "(UNC_M_POWER_CRITICAL_THROTTLE_CYCLES / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_critical_throttle_cycles %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Cycles Memory is in self refresh power mode. Derived from unc_m_power_self_refresh",
+ "BriefDescription": "Cycles Memory is in self refresh power mode",
"Counter": "0,1,2,3",
"EventCode": "0x43",
"EventName": "UNC_M_POWER_SELF_REFRESH",
"MetricExpr": "(UNC_M_POWER_SELF_REFRESH / UNC_M_CLOCKTICKS) * 100.",
+ "MetricName": "power_self_refresh %",
"PerPkg": "1",
"Unit": "iMC"
},
{
- "BriefDescription": "Memory page conflicts. Derived from unc_m_pre_count.page_miss",
+ "BriefDescription": "Memory page conflicts",
"Counter": "0,1,2,3",
"EventCode": "0x2",
"EventName": "UNC_M_PRE_COUNT.PAGE_MISS",
@@ -69,7 +72,7 @@
"Unit": "iMC"
},
{
- "BriefDescription": "Occupancy counter for memory read queue. Derived from unc_m_rpq_occupancy",
+ "BriefDescription": "Occupancy counter for memory read queue",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_M_RPQ_OCCUPANCY",
diff --git a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
index bbe36d547386..16034bfd06dd 100644
--- a/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
+++ b/tools/perf/pmu-events/arch/x86/jaketown/uncore-power.json
@@ -1,44 +1,48 @@
[
{
- "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events. Derived from unc_p_clockticks",
+ "BriefDescription": "PCU clock ticks. Use to get percentages of PCU cycles events",
"Counter": "0,1,2,3",
"EventName": "UNC_P_CLOCKTICKS",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band0=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band0_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band0=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xb",
"EventName": "UNC_P_FREQ_BAND0_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band0_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band1=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band1_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band1=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xc",
"EventName": "UNC_P_FREQ_BAND1_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band1_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band2=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band2_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band2=XXX with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xd",
"EventName": "UNC_P_FREQ_BAND2_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band2_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency. Derived from unc_p_freq_band3_cycles",
+ "BriefDescription": "Counts the number of cycles that the uncore was running at a frequency greater than or equal to the frequency that is configured in the filter. (filter_band3=XXX, with XXX in 100Mhz units). One can also use inversion (filter_inv=1) to track cycles when we were less than the configured frequency",
"Counter": "0,1,2,3",
"EventCode": "0xe",
"EventName": "UNC_P_FREQ_BAND3_CYCLES",
"MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band3_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -49,6 +53,7 @@
"EventName": "UNC_P_FREQ_BAND0_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND0_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band0_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -59,6 +64,7 @@
"EventName": "UNC_P_FREQ_BAND1_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND1_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band1_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -69,6 +75,7 @@
"EventName": "UNC_P_FREQ_BAND2_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND2_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band2_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -79,89 +86,99 @@
"EventName": "UNC_P_FREQ_BAND3_TRANSITIONS",
"Filter": "edge=1",
"MetricExpr": "(UNC_P_FREQ_BAND3_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_band3_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c0",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C0. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C0",
"Filter": "occ_sel=1",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C0 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c0 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details. Derived from unc_p_power_state_occupancy.cores_c3",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C3. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events and occupancy triggering to capture other details",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C3",
"Filter": "occ_sel=2",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C3 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c3 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events . Derived from unc_p_power_state_occupancy.cores_c6",
+ "BriefDescription": "This is an occupancy event that tracks the number of cores that are in C6. It can be used by itself to get the average number of cores in C0, with threshholding to generate histograms, or with other PCU events ",
"Counter": "0,1,2,3",
"EventCode": "0x80",
"EventName": "UNC_P_POWER_STATE_OCCUPANCY.CORES_C6",
"Filter": "occ_sel=3",
"MetricExpr": "(UNC_P_POWER_STATE_OCCUPANCY.CORES_C6 / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "power_state_occupancy.cores_c6 %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip. Derived from unc_p_prochot_external_cycles",
+ "BriefDescription": "Counts the number of cycles that we are in external PROCHOT mode. This mode is triggered when a sensor off the die determines that something off-die (like DRAM) is too hot and must throttle to avoid damaging the chip",
"Counter": "0,1,2,3",
"EventCode": "0xa",
"EventName": "UNC_P_PROCHOT_EXTERNAL_CYCLES",
"MetricExpr": "(UNC_P_PROCHOT_EXTERNAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "prochot_external_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency. Derived from unc_p_freq_max_limit_thermal_cycles",
+ "BriefDescription": "Counts the number of cycles when temperature is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x4",
"EventName": "UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_LIMIT_THERMAL_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_limit_thermal_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency. Derived from unc_p_freq_max_os_cycles",
+ "BriefDescription": "Counts the number of cycles when the OS is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x6",
"EventName": "UNC_P_FREQ_MAX_OS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_OS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_os_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency. Derived from unc_p_freq_max_power_cycles",
+ "BriefDescription": "Counts the number of cycles when power is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x5",
"EventName": "UNC_P_FREQ_MAX_POWER_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_POWER_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_power_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency. Derived from unc_p_freq_max_current_cycles",
+ "BriefDescription": "Counts the number of cycles when current is the upper limit on frequency",
"Counter": "0,1,2,3",
"EventCode": "0x7",
"EventName": "UNC_P_FREQ_MAX_CURRENT_CYCLES",
"MetricExpr": "(UNC_P_FREQ_MAX_CURRENT_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_max_current_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
{
- "BriefDescription": "Cycles spent changing Frequency. Derived from unc_p_freq_trans_cycles",
+ "BriefDescription": "Cycles spent changing Frequency",
"Counter": "0,1,2,3",
"EventName": "UNC_P_FREQ_TRANS_CYCLES",
"MetricExpr": "(UNC_P_FREQ_TRANS_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_trans_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -172,6 +189,7 @@
"EventName": "UNC_P_FREQ_GE_1200MHZ_CYCLES",
"Filter": "filter_band0=1200",
"MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_1200mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -182,6 +200,7 @@
"EventName": "UNC_P_FREQ_GE_2000MHZ_CYCLES",
"Filter": "filter_band1=2000",
"MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_2000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -192,6 +211,7 @@
"EventName": "UNC_P_FREQ_GE_3000MHZ_CYCLES",
"Filter": "filter_band2=3000",
"MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_3000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -202,6 +222,7 @@
"EventName": "UNC_P_FREQ_GE_4000MHZ_CYCLES",
"Filter": "filter_band3=4000",
"MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_4000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -212,6 +233,7 @@
"EventName": "UNC_P_FREQ_GE_1200MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band0=1200",
"MetricExpr": "(UNC_P_FREQ_GE_1200MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_1200mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -222,6 +244,7 @@
"EventName": "UNC_P_FREQ_GE_2000MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band1=2000",
"MetricExpr": "(UNC_P_FREQ_GE_2000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_2000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -232,6 +255,7 @@
"EventName": "UNC_P_FREQ_GE_3000MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band2=4000",
"MetricExpr": "(UNC_P_FREQ_GE_3000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_3000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
},
@@ -242,6 +266,7 @@
"EventName": "UNC_P_FREQ_GE_4000MHZ_TRANSITIONS",
"Filter": "edge=1,filter_band3=4000",
"MetricExpr": "(UNC_P_FREQ_GE_4000MHZ_CYCLES / UNC_P_CLOCKTICKS) * 100.",
+ "MetricName": "freq_ge_4000mhz_cycles %",
"PerPkg": "1",
"Unit": "PCU"
}
diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv
index 12181bb1da2a..d1a12e584c1b 100644
--- a/tools/perf/pmu-events/arch/x86/mapfile.csv
+++ b/tools/perf/pmu-events/arch/x86/mapfile.csv
@@ -17,6 +17,7 @@ GenuineIntel-6-3A,v18,ivybridge,core
GenuineIntel-6-3E,v19,ivytown,core
GenuineIntel-6-2D,v20,jaketown,core
GenuineIntel-6-57,v9,knightslanding,core
+GenuineIntel-6-85,v9,knightslanding,core
GenuineIntel-6-1E,v2,nehalemep,core
GenuineIntel-6-1F,v2,nehalemep,core
GenuineIntel-6-1A,v2,nehalemep,core
diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/uncore.json b/tools/perf/pmu-events/arch/x86/sandybridge/uncore.json
new file mode 100644
index 000000000000..42c70eed05a2
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/sandybridge/uncore.json
@@ -0,0 +1,314 @@
+[
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x01",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS",
+ "BriefDescription": "A snoop misses in some processor core.",
+ "PublicDescription": "A snoop misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x02",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL",
+ "BriefDescription": "A snoop invalidates a non-modified line in some processor core.",
+ "PublicDescription": "A snoop invalidates a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x04",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT",
+ "BriefDescription": "A snoop hits a non-modified line in some processor core.",
+ "PublicDescription": "A snoop hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x08",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM",
+ "BriefDescription": "A snoop hits a modified line in some processor core.",
+ "PublicDescription": "A snoop hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x10",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.INVAL_M",
+ "BriefDescription": "A snoop invalidates a modified line in some processor core.",
+ "PublicDescription": "A snoop invalidates a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x20",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.EXTERNAL_FILTER",
+ "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+ "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to external snoop request.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x40",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.XCORE_FILTER",
+ "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+ "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to processor core memory request.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x80",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.EVICTION_FILTER",
+ "BriefDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+ "PublicDescription": "Filter on cross-core snoops initiated by this Cbox due to LLC eviction.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x01",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.M",
+ "BriefDescription": "LLC lookup request that access cache and found line in M-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x02",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.E",
+ "BriefDescription": "LLC lookup request that access cache and found line in E-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in E-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x04",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.S",
+ "BriefDescription": "LLC lookup request that access cache and found line in S-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x08",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.I",
+ "BriefDescription": "LLC lookup request that access cache and found line in I-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x10",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_FILTER",
+ "BriefDescription": "Filter on processor core initiated cacheable read requests.",
+ "PublicDescription": "Filter on processor core initiated cacheable read requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x20",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_FILTER",
+ "BriefDescription": "Filter on processor core initiated cacheable write requests.",
+ "PublicDescription": "Filter on processor core initiated cacheable write requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x40",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.EXTSNP_FILTER",
+ "BriefDescription": "Filter on external snoop requests.",
+ "PublicDescription": "Filter on external snoop requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x80",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_REQUEST_FILTER",
+ "BriefDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+ "PublicDescription": "Filter on any IRQ or IPQ initiated requests including uncacheable, non-coherent requests.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "PublicDescription": "Counts cycles weighted by the number of requests waiting for data returning from the memory controller. Accounts for coherent and non-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x81",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+ "BriefDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "PublicDescription": "Counts the number of coherent and in-coherent requests initiated by IA cores, processor graphic units, or LLC.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x81",
+ "UMask": "0x20",
+ "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+ "BriefDescription": "Counts the number of allocated write entries, include full, partial, and LLC evictions.",
+ "PublicDescription": "Counts the number of allocated write entries, include full, partial, and LLC evictions.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x81",
+ "UMask": "0x80",
+ "EventName": "UNC_ARB_TRK_REQUESTS.EVICTIONS",
+ "BriefDescription": "Counts the number of LLC evictions allocated.",
+ "PublicDescription": "Counts the number of LLC evictions allocated.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x83",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+ "PublicDescription": "Cycles weighted by number of requests pending in Coherency Tracker.",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x84",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+ "BriefDescription": "Number of requests allocated in Coherency Tracker.",
+ "PublicDescription": "Number of requests allocated in Coherency Tracker.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+ "BriefDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "PublicDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "Counter": "0,1",
+ "CounterMask": "1",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_OVER_HALF_FULL",
+ "BriefDescription": "Cycles with at least half of the requests outstanding are waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "PublicDescription": "Cycles with at least half of the requests outstanding are waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "Counter": "0,1",
+ "CounterMask": "10",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "ARB",
+ "EventCode": "0x0",
+ "UMask": "0x01",
+ "EventName": "UNC_CLOCK.SOCKET",
+ "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "Counter": "Fixed",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x06",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ES",
+ "BriefDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+ "PublicDescription": "LLC lookup request that access cache and found line in E-state or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ }
+] \ No newline at end of file
diff --git a/tools/perf/pmu-events/arch/x86/skylake/uncore.json b/tools/perf/pmu-events/arch/x86/skylake/uncore.json
new file mode 100644
index 000000000000..dbc193252fb3
--- /dev/null
+++ b/tools/perf/pmu-events/arch/x86/skylake/uncore.json
@@ -0,0 +1,254 @@
+[
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x41",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x81",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.MISS_EVICTION",
+ "BriefDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+ "PublicDescription": "A cross-core snoop resulted from L3 Eviction which misses in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x44",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HIT_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a non-modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x22",
+ "UMask": "0x48",
+ "EventName": "UNC_CBO_XSNP_RESPONSE.HITM_XCORE",
+ "BriefDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+ "PublicDescription": "A cross-core snoop initiated by this Cbox due to processor core memory request which hits a modified line in some processor core.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x21",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_M",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in M-state",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x81",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_M",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in M-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in M-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x18",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_I",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in I-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x88",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_I",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in I-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in I-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x1f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_MESI",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in any MESI-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in any MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x2f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_MESI",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in MESI-state",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x8f",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_MESI",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in MESI-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in MESI-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x86",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.ANY_ES",
+ "BriefDescription": "L3 Lookup any request that access cache and found line in E or S-state",
+ "PublicDescription": "L3 Lookup any request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x16",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.READ_ES",
+ "BriefDescription": "L3 Lookup read request that access cache and found line in E or S-state",
+ "PublicDescription": "L3 Lookup read request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "CBO",
+ "EventCode": "0x34",
+ "UMask": "0x26",
+ "EventName": "UNC_CBO_CACHE_LOOKUP.WRITE_ES",
+ "BriefDescription": "L3 Lookup write request that access cache and found line in E or S-state",
+ "PublicDescription": "L3 Lookup write request that access cache and found line in E or S-state.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.ALL",
+ "BriefDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+ "PublicDescription": "Each cycle count number of all Core outgoing valid entries. Such entry is defined as valid from its allocation till first of IDI0 or DRS0 messages is sent out. Accounts for Coherent and non-coherent traffic.",
+ "Counter": "0",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_REQUESTS.ALL",
+ "BriefDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+ "PublicDescription": "Total number of Core outgoing entries allocated. Accounts for Coherent and non-coherent traffic.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x02",
+ "EventName": "UNC_ARB_TRK_REQUESTS.DRD_DIRECT",
+ "BriefDescription": "Number of Core coherent Data Read entries allocated in DirectData mode",
+ "PublicDescription": "Number of Core coherent Data Read entries allocated in DirectData mode.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x81",
+ "UMask": "0x20",
+ "EventName": "UNC_ARB_TRK_REQUESTS.WRITES",
+ "BriefDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+ "PublicDescription": "Number of Writes allocated - any write transactions: full/partials writes and evictions.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x84",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_COH_TRK_REQUESTS.ALL",
+ "BriefDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+ "PublicDescription": "Number of entries allocated. Account for Any type: e.g. Snoop, Core aperture, etc.",
+ "Counter": "0,1",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "iMPH-U",
+ "EventCode": "0x80",
+ "UMask": "0x01",
+ "EventName": "UNC_ARB_TRK_OCCUPANCY.CYCLES_WITH_ANY_REQUEST",
+ "BriefDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.;",
+ "PublicDescription": "Cycles with at least one request outstanding is waiting for data return from memory controller. Account for coherent and non-coherent requests initiated by IA Cores, Processor Graphics Unit, or LLC.",
+ "Counter": "0",
+ "CounterMask": "1",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ },
+ {
+ "Unit": "NCU",
+ "EventCode": "0x0",
+ "UMask": "0x01",
+ "EventName": "UNC_CLOCK.SOCKET",
+ "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles",
+ "PublicDescription": "This 48-bit fixed counter counts the UCLK cycles.",
+ "Counter": "FIXED",
+ "CounterMask": "0",
+ "Invert": "0",
+ "EdgeDetect": "0"
+ }
+] \ No newline at end of file
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index eed09346a72a..baa073f38334 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -195,6 +195,7 @@ static struct map {
{ "CBO", "uncore_cbox" },
{ "QPI LL", "uncore_qpi" },
{ "SBO", "uncore_sbox" },
+ { "iMPH-U", "uncore_arb" },
{}
};
@@ -291,7 +292,9 @@ static void print_events_table_prefix(FILE *fp, const char *tblname)
static int print_events_table_entry(void *data, char *name, char *event,
char *desc, char *long_desc,
- char *pmu, char *unit, char *perpkg)
+ char *pmu, char *unit, char *perpkg,
+ char *metric_expr,
+ char *metric_name)
{
struct perf_entry_data *pd = data;
FILE *outfp = pd->outfp;
@@ -315,6 +318,10 @@ static int print_events_table_entry(void *data, char *name, char *event,
fprintf(outfp, "\t.unit = \"%s\",\n", unit);
if (perpkg)
fprintf(outfp, "\t.perpkg = \"%s\",\n", perpkg);
+ if (metric_expr)
+ fprintf(outfp, "\t.metric_expr = \"%s\",\n", metric_expr);
+ if (metric_name)
+ fprintf(outfp, "\t.metric_name = \"%s\",\n", metric_name);
fprintf(outfp, "},\n");
return 0;
@@ -362,7 +369,9 @@ static char *real_event(const char *name, char *event)
int json_events(const char *fn,
int (*func)(void *data, char *name, char *event, char *desc,
char *long_desc,
- char *pmu, char *unit, char *perpkg),
+ char *pmu, char *unit, char *perpkg,
+ char *metric_expr,
+ char *metric_name),
void *data)
{
int err = -EIO;
@@ -388,6 +397,8 @@ int json_events(const char *fn,
char *filter = NULL;
char *perpkg = NULL;
char *unit = NULL;
+ char *metric_expr = NULL;
+ char *metric_name = NULL;
unsigned long long eventcode = 0;
struct msrmap *msr = NULL;
jsmntok_t *msrval = NULL;
@@ -398,6 +409,7 @@ int json_events(const char *fn,
for (j = 0; j < obj->size; j += 2) {
jsmntok_t *field, *val;
int nz;
+ char *s;
field = tok + j;
EXPECT(field->type == JSMN_STRING, tok + j,
@@ -444,7 +456,6 @@ int json_events(const char *fn,
NULL);
} else if (json_streq(map, field, "Unit")) {
const char *ppmu;
- char *s;
ppmu = field_to_perf(unit_to_pmu, map, val);
if (ppmu) {
@@ -458,12 +469,19 @@ int json_events(const char *fn,
}
addfield(map, &desc, ". ", "Unit: ", NULL);
addfield(map, &desc, "", pmu, NULL);
+ addfield(map, &desc, "", " ", NULL);
} else if (json_streq(map, field, "Filter")) {
addfield(map, &filter, "", "", val);
} else if (json_streq(map, field, "ScaleUnit")) {
addfield(map, &unit, "", "", val);
} else if (json_streq(map, field, "PerPkg")) {
addfield(map, &perpkg, "", "", val);
+ } else if (json_streq(map, field, "MetricName")) {
+ addfield(map, &metric_name, "", "", val);
+ } else if (json_streq(map, field, "MetricExpr")) {
+ addfield(map, &metric_expr, "", "", val);
+ for (s = metric_expr; *s; s++)
+ *s = tolower(*s);
}
/* ignore unknown fields */
}
@@ -488,7 +506,7 @@ int json_events(const char *fn,
fixname(name);
err = func(data, name, real_event(name, event), desc, long_desc,
- pmu, unit, perpkg);
+ pmu, unit, perpkg, metric_expr, metric_name);
free(event);
free(desc);
free(name);
@@ -498,6 +516,8 @@ int json_events(const char *fn,
free(filter);
free(perpkg);
free(unit);
+ free(metric_expr);
+ free(metric_name);
if (err)
break;
tok += j;
diff --git a/tools/perf/pmu-events/jevents.h b/tools/perf/pmu-events/jevents.h
index 71e13de31092..611fac01913d 100644
--- a/tools/perf/pmu-events/jevents.h
+++ b/tools/perf/pmu-events/jevents.h
@@ -5,7 +5,8 @@ int json_events(const char *fn,
int (*func)(void *data, char *name, char *event, char *desc,
char *long_desc,
char *pmu,
- char *unit, char *perpkg),
+ char *unit, char *perpkg, char *metric_expr,
+ char *metric_name),
void *data);
char *get_cpu_str(void);
diff --git a/tools/perf/pmu-events/pmu-events.h b/tools/perf/pmu-events/pmu-events.h
index c669a3cdb9f0..569eab3688dd 100644
--- a/tools/perf/pmu-events/pmu-events.h
+++ b/tools/perf/pmu-events/pmu-events.h
@@ -13,6 +13,8 @@ struct pmu_event {
const char *pmu;
const char *unit;
const char *perpkg;
+ const char *metric_expr;
+ const char *metric_name;
};
/*
diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build
index 1cb3d9b540e9..af58ebc243ef 100644
--- a/tools/perf/tests/Build
+++ b/tools/perf/tests/Build
@@ -38,6 +38,7 @@ perf-y += cpumap.o
perf-y += stat.o
perf-y += event_update.o
perf-y += event-times.o
+perf-y += expr.o
perf-y += backward-ring-buffer.o
perf-y += sdt.o
perf-y += is_printable_array.o
diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c
index 83c4669cbc5b..e6d7876c94c2 100644
--- a/tools/perf/tests/builtin-test.c
+++ b/tools/perf/tests/builtin-test.c
@@ -44,6 +44,10 @@ static struct test generic_tests[] = {
.func = test__parse_events,
},
{
+ .desc = "Simple expression parser",
+ .func = test__expr,
+ },
+ {
.desc = "PERF_RECORD_* events & perf_sample fields",
.func = test__PERF_RECORD,
},
@@ -460,7 +464,7 @@ static int perf_test__list(int argc, const char **argv)
return 0;
}
-int cmd_test(int argc, const char **argv, const char *prefix __maybe_unused)
+int cmd_test(int argc, const char **argv)
{
const char *test_usage[] = {
"perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]",
diff --git a/tools/perf/tests/cpumap.c b/tools/perf/tests/cpumap.c
index f168a85992d0..4478773cdb97 100644
--- a/tools/perf/tests/cpumap.c
+++ b/tools/perf/tests/cpumap.c
@@ -66,7 +66,7 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused,
TEST_ASSERT_VAL("wrong nr", map->nr == 2);
TEST_ASSERT_VAL("wrong cpu", map->map[0] == 1);
TEST_ASSERT_VAL("wrong cpu", map->map[1] == 256);
- TEST_ASSERT_VAL("wrong refcnt", atomic_read(&map->refcnt) == 1);
+ TEST_ASSERT_VAL("wrong refcnt", refcount_read(&map->refcnt) == 1);
cpu_map__put(map);
return 0;
}
diff --git a/tools/perf/tests/expr.c b/tools/perf/tests/expr.c
new file mode 100644
index 000000000000..6c6a3749aaf6
--- /dev/null
+++ b/tools/perf/tests/expr.c
@@ -0,0 +1,56 @@
+#include "util/debug.h"
+#include "util/expr.h"
+#include "tests.h"
+#include <stdlib.h>
+
+static int test(struct parse_ctx *ctx, const char *e, double val2)
+{
+ double val;
+
+ if (expr__parse(&val, ctx, &e))
+ TEST_ASSERT_VAL("parse test failed", 0);
+ TEST_ASSERT_VAL("unexpected value", val == val2);
+ return 0;
+}
+
+int test__expr(int subtest __maybe_unused)
+{
+ const char *p;
+ const char **other;
+ double val;
+ int ret;
+ struct parse_ctx ctx;
+ int num_other;
+
+ expr__ctx_init(&ctx);
+ expr__add_id(&ctx, "FOO", 1);
+ expr__add_id(&ctx, "BAR", 2);
+
+ ret = test(&ctx, "1+1", 2);
+ ret |= test(&ctx, "FOO+BAR", 3);
+ ret |= test(&ctx, "(BAR/2)%2", 1);
+ ret |= test(&ctx, "1 - -4", 5);
+ ret |= test(&ctx, "(FOO-1)*2 + (BAR/2)%2 - -4", 5);
+
+ if (ret)
+ return ret;
+
+ p = "FOO/0";
+ ret = expr__parse(&val, &ctx, &p);
+ TEST_ASSERT_VAL("division by zero", ret == 1);
+
+ p = "BAR/";
+ ret = expr__parse(&val, &ctx, &p);
+ TEST_ASSERT_VAL("missing operand", ret == 1);
+
+ TEST_ASSERT_VAL("find other",
+ expr__find_other("FOO + BAR + BAZ + BOZO", "FOO", &other, &num_other) == 0);
+ TEST_ASSERT_VAL("find other", num_other == 3);
+ TEST_ASSERT_VAL("find other", !strcmp(other[0], "BAR"));
+ TEST_ASSERT_VAL("find other", !strcmp(other[1], "BAZ"));
+ TEST_ASSERT_VAL("find other", !strcmp(other[2], "BOZO"));
+ TEST_ASSERT_VAL("find other", other[3] == NULL);
+ free((void *)other);
+
+ return 0;
+}
diff --git a/tools/perf/tests/sdt.c b/tools/perf/tests/sdt.c
index f59d210e1baf..26e5b7a0b839 100644
--- a/tools/perf/tests/sdt.c
+++ b/tools/perf/tests/sdt.c
@@ -43,7 +43,7 @@ static char *get_self_path(void)
{
char *buf = calloc(PATH_MAX, sizeof(char));
- if (buf && readlink("/proc/self/exe", buf, PATH_MAX) < 0) {
+ if (buf && readlink("/proc/self/exe", buf, PATH_MAX - 1) < 0) {
pr_debug("Failed to get correct path of perf\n");
free(buf);
return NULL;
diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h
index 1fa9b9d83aa5..631859629403 100644
--- a/tools/perf/tests/tests.h
+++ b/tools/perf/tests/tests.h
@@ -62,6 +62,7 @@ int test__sample_parsing(int subtest);
int test__keep_tracking(int subtest);
int test__parse_no_sample_id_all(int subtest);
int test__dwarf_unwind(int subtest);
+int test__expr(int subtest);
int test__hists_filter(int subtest);
int test__mmap_thread_lookup(int subtest);
int test__thread_mg_share(int subtest);
diff --git a/tools/perf/tests/thread-map.c b/tools/perf/tests/thread-map.c
index f2d2e542d0ee..a63d6945807b 100644
--- a/tools/perf/tests/thread-map.c
+++ b/tools/perf/tests/thread-map.c
@@ -29,7 +29,7 @@ int test__thread_map(int subtest __maybe_unused)
thread_map__comm(map, 0) &&
!strcmp(thread_map__comm(map, 0), NAME));
TEST_ASSERT_VAL("wrong refcnt",
- atomic_read(&map->refcnt) == 1);
+ refcount_read(&map->refcnt) == 1);
thread_map__put(map);
/* test dummy pid */
@@ -44,7 +44,7 @@ int test__thread_map(int subtest __maybe_unused)
thread_map__comm(map, 0) &&
!strcmp(thread_map__comm(map, 0), "dummy"));
TEST_ASSERT_VAL("wrong refcnt",
- atomic_read(&map->refcnt) == 1);
+ refcount_read(&map->refcnt) == 1);
thread_map__put(map);
return 0;
}
@@ -71,7 +71,7 @@ static int process_event(struct perf_tool *tool __maybe_unused,
thread_map__comm(threads, 0) &&
!strcmp(thread_map__comm(threads, 0), NAME));
TEST_ASSERT_VAL("wrong refcnt",
- atomic_read(&threads->refcnt) == 1);
+ refcount_read(&threads->refcnt) == 1);
thread_map__put(threads);
return 0;
}
diff --git a/tools/perf/tests/thread-mg-share.c b/tools/perf/tests/thread-mg-share.c
index 188b63140fc8..76686dd6f5ec 100644
--- a/tools/perf/tests/thread-mg-share.c
+++ b/tools/perf/tests/thread-mg-share.c
@@ -43,7 +43,7 @@ int test__thread_mg_share(int subtest __maybe_unused)
leader && t1 && t2 && t3 && other);
mg = leader->mg;
- TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 4);
+ TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 4);
/* test the map groups pointer is shared */
TEST_ASSERT_VAL("map groups don't match", mg == t1->mg);
@@ -71,25 +71,25 @@ int test__thread_mg_share(int subtest __maybe_unused)
machine__remove_thread(machine, other_leader);
other_mg = other->mg;
- TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 2);
+ TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&other_mg->refcnt), 2);
TEST_ASSERT_VAL("map groups don't match", other_mg == other_leader->mg);
/* release thread group */
thread__put(leader);
- TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 3);
+ TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 3);
thread__put(t1);
- TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 2);
+ TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 2);
thread__put(t2);
- TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&mg->refcnt), 1);
+ TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&mg->refcnt), 1);
thread__put(t3);
/* release other group */
thread__put(other_leader);
- TEST_ASSERT_EQUAL("wrong refcnt", atomic_read(&other_mg->refcnt), 1);
+ TEST_ASSERT_EQUAL("wrong refcnt", refcount_read(&other_mg->refcnt), 1);
thread__put(other);
diff --git a/tools/perf/trace/beauty/Build b/tools/perf/trace/beauty/Build
new file mode 100644
index 000000000000..be95ac6ce845
--- /dev/null
+++ b/tools/perf/trace/beauty/Build
@@ -0,0 +1 @@
+libperf-y += statx.o
diff --git a/tools/perf/trace/beauty/beauty.h b/tools/perf/trace/beauty/beauty.h
new file mode 100644
index 000000000000..cf50be3f17a4
--- /dev/null
+++ b/tools/perf/trace/beauty/beauty.h
@@ -0,0 +1,24 @@
+#ifndef _PERF_TRACE_BEAUTY_H
+#define _PERF_TRACE_BEAUTY_H
+
+#include <linux/types.h>
+
+struct trace;
+struct thread;
+
+struct syscall_arg {
+ unsigned long val;
+ struct thread *thread;
+ struct trace *trace;
+ void *parm;
+ u8 idx;
+ u8 mask;
+};
+
+size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_STATX_FLAGS syscall_arg__scnprintf_statx_flags
+
+size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg);
+#define SCA_STATX_MASK syscall_arg__scnprintf_statx_mask
+
+#endif /* _PERF_TRACE_BEAUTY_H */
diff --git a/tools/perf/trace/beauty/statx.c b/tools/perf/trace/beauty/statx.c
new file mode 100644
index 000000000000..5643b692af4c
--- /dev/null
+++ b/tools/perf/trace/beauty/statx.c
@@ -0,0 +1,72 @@
+/*
+ * trace/beauty/statx.c
+ *
+ * Copyright (C) 2017, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
+ */
+
+#include "trace/beauty/beauty.h"
+#include <linux/kernel.h>
+#include <sys/types.h>
+#include <uapi/linux/fcntl.h>
+#include <uapi/linux/stat.h>
+
+size_t syscall_arg__scnprintf_statx_flags(char *bf, size_t size, struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
+
+ if (flags == 0)
+ return scnprintf(bf, size, "SYNC_AS_STAT");
+#define P_FLAG(n) \
+ if (flags & AT_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~AT_##n; \
+ }
+
+ P_FLAG(SYMLINK_NOFOLLOW);
+ P_FLAG(REMOVEDIR);
+ P_FLAG(SYMLINK_FOLLOW);
+ P_FLAG(NO_AUTOMOUNT);
+ P_FLAG(EMPTY_PATH);
+ P_FLAG(STATX_FORCE_SYNC);
+ P_FLAG(STATX_DONT_SYNC);
+
+#undef P_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
+}
+
+size_t syscall_arg__scnprintf_statx_mask(char *bf, size_t size, struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
+
+#define P_FLAG(n) \
+ if (flags & STATX_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~STATX_##n; \
+ }
+
+ P_FLAG(TYPE);
+ P_FLAG(MODE);
+ P_FLAG(NLINK);
+ P_FLAG(UID);
+ P_FLAG(GID);
+ P_FLAG(ATIME);
+ P_FLAG(MTIME);
+ P_FLAG(CTIME);
+ P_FLAG(INO);
+ P_FLAG(SIZE);
+ P_FLAG(BLOCKS);
+ P_FLAG(BTIME);
+
+#undef P_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
+}
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index fc4fb669ceee..da24072bb76e 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -144,9 +144,60 @@ static void callchain_list__set_folding(struct callchain_list *cl, bool unfold)
cl->unfolded = unfold ? cl->has_children : false;
}
+static struct inline_node *inline_node__create(struct map *map, u64 ip)
+{
+ struct dso *dso;
+ struct inline_node *node;
+
+ if (map == NULL)
+ return NULL;
+
+ dso = map->dso;
+ if (dso == NULL)
+ return NULL;
+
+ if (dso->kernel != DSO_TYPE_USER)
+ return NULL;
+
+ node = dso__parse_addr_inlines(dso,
+ map__rip_2objdump(map, ip));
+
+ return node;
+}
+
+static int inline__count_rows(struct inline_node *node)
+{
+ struct inline_list *ilist;
+ int i = 0;
+
+ if (node == NULL)
+ return 0;
+
+ list_for_each_entry(ilist, &node->val, list) {
+ if ((ilist->filename != NULL) || (ilist->funcname != NULL))
+ i++;
+ }
+
+ return i;
+}
+
+static int callchain_list__inline_rows(struct callchain_list *chain)
+{
+ struct inline_node *node;
+ int rows;
+
+ node = inline_node__create(chain->ms.map, chain->ip);
+ if (node == NULL)
+ return 0;
+
+ rows = inline__count_rows(node);
+ inline_node__delete(node);
+ return rows;
+}
+
static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
{
- int n = 0;
+ int n = 0, inline_rows;
struct rb_node *nd;
for (nd = rb_first(&node->rb_root); nd; nd = rb_next(nd)) {
@@ -156,6 +207,13 @@ static int callchain_node__count_rows_rb_tree(struct callchain_node *node)
list_for_each_entry(chain, &child->val, list) {
++n;
+
+ if (symbol_conf.inline_name) {
+ inline_rows =
+ callchain_list__inline_rows(chain);
+ n += inline_rows;
+ }
+
/* We need this because we may not have children */
folded_sign = callchain_list__folded(chain);
if (folded_sign == '+')
@@ -207,7 +265,7 @@ static int callchain_node__count_rows(struct callchain_node *node)
{
struct callchain_list *chain;
bool unfolded = false;
- int n = 0;
+ int n = 0, inline_rows;
if (callchain_param.mode == CHAIN_FLAT)
return callchain_node__count_flat_rows(node);
@@ -216,6 +274,11 @@ static int callchain_node__count_rows(struct callchain_node *node)
list_for_each_entry(chain, &node->val, list) {
++n;
+ if (symbol_conf.inline_name) {
+ inline_rows = callchain_list__inline_rows(chain);
+ n += inline_rows;
+ }
+
unfolded = chain->unfolded;
}
@@ -362,6 +425,19 @@ static void hist_entry__init_have_children(struct hist_entry *he)
he->init_have_children = true;
}
+static void hist_entry_init_inline_node(struct hist_entry *he)
+{
+ if (he->inline_node)
+ return;
+
+ he->inline_node = inline_node__create(he->ms.map, he->ip);
+
+ if (he->inline_node == NULL)
+ return;
+
+ he->has_children = true;
+}
+
static bool hist_browser__toggle_fold(struct hist_browser *browser)
{
struct hist_entry *he = browser->he_selection;
@@ -393,7 +469,12 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser)
if (he->unfolded) {
if (he->leaf)
- he->nr_rows = callchain__count_rows(&he->sorted_chain);
+ if (he->inline_node)
+ he->nr_rows = inline__count_rows(
+ he->inline_node);
+ else
+ he->nr_rows = callchain__count_rows(
+ &he->sorted_chain);
else
he->nr_rows = hierarchy_count_rows(browser, he, false);
@@ -753,6 +834,71 @@ static bool hist_browser__check_dump_full(struct hist_browser *browser __maybe_u
#define LEVEL_OFFSET_STEP 3
+static int hist_browser__show_inline(struct hist_browser *browser,
+ struct inline_node *node,
+ unsigned short row,
+ int offset)
+{
+ struct inline_list *ilist;
+ char buf[1024];
+ int color, width, first_row;
+
+ first_row = row;
+ width = browser->b.width - (LEVEL_OFFSET_STEP + 2);
+ list_for_each_entry(ilist, &node->val, list) {
+ if ((ilist->filename != NULL) || (ilist->funcname != NULL)) {
+ color = HE_COLORSET_NORMAL;
+ if (ui_browser__is_current_entry(&browser->b, row))
+ color = HE_COLORSET_SELECTED;
+
+ if (callchain_param.key == CCKEY_ADDRESS ||
+ callchain_param.key == CCKEY_SRCLINE) {
+ if (ilist->filename != NULL)
+ scnprintf(buf, sizeof(buf),
+ "%s:%d (inline)",
+ ilist->filename,
+ ilist->line_nr);
+ else
+ scnprintf(buf, sizeof(buf), "??");
+ } else if (ilist->funcname != NULL)
+ scnprintf(buf, sizeof(buf), "%s (inline)",
+ ilist->funcname);
+ else if (ilist->filename != NULL)
+ scnprintf(buf, sizeof(buf),
+ "%s:%d (inline)",
+ ilist->filename,
+ ilist->line_nr);
+ else
+ scnprintf(buf, sizeof(buf), "??");
+
+ ui_browser__set_color(&browser->b, color);
+ hist_browser__gotorc(browser, row, 0);
+ ui_browser__write_nstring(&browser->b, " ",
+ LEVEL_OFFSET_STEP + offset);
+ ui_browser__write_nstring(&browser->b, buf, width);
+ row++;
+ }
+ }
+
+ return row - first_row;
+}
+
+static size_t show_inline_list(struct hist_browser *browser, struct map *map,
+ u64 ip, int row, int offset)
+{
+ struct inline_node *node;
+ int ret;
+
+ node = inline_node__create(map, ip);
+ if (node == NULL)
+ return 0;
+
+ ret = hist_browser__show_inline(browser, node, row, offset);
+
+ inline_node__delete(node);
+ return ret;
+}
+
static int hist_browser__show_callchain_list(struct hist_browser *browser,
struct callchain_node *node,
struct callchain_list *chain,
@@ -764,6 +910,7 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
char bf[1024], *alloc_str;
char buf[64], *alloc_str2;
const char *str;
+ int inline_rows = 0, ret = 1;
if (arg->row_offset != 0) {
arg->row_offset--;
@@ -801,10 +948,15 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
}
print(browser, chain, str, offset, row, arg);
-
free(alloc_str);
free(alloc_str2);
- return 1;
+
+ if (symbol_conf.inline_name) {
+ inline_rows = show_inline_list(browser, chain->ms.map,
+ chain->ip, row + 1, offset);
+ }
+
+ return ret + inline_rows;
}
static bool check_percent_display(struct rb_node *node, u64 parent_total)
@@ -1228,6 +1380,12 @@ static int hist_browser__show_entry(struct hist_browser *browser,
folded_sign = hist_entry__folded(entry);
}
+ if (symbol_conf.inline_name &&
+ (!entry->has_children)) {
+ hist_entry_init_inline_node(entry);
+ folded_sign = hist_entry__folded(entry);
+ }
+
if (row_offset == 0) {
struct hpp_arg arg = {
.b = &browser->b,
@@ -1259,7 +1417,8 @@ static int hist_browser__show_entry(struct hist_browser *browser,
}
if (first) {
- if (symbol_conf.use_callchain) {
+ if (symbol_conf.use_callchain ||
+ symbol_conf.inline_name) {
ui_browser__printf(&browser->b, "%c ", folded_sign);
width -= 2;
}
@@ -1301,8 +1460,14 @@ static int hist_browser__show_entry(struct hist_browser *browser,
.is_current_entry = current_entry,
};
- printed += hist_browser__show_callchain(browser, entry, 1, row,
- hist_browser__show_callchain_entry, &arg,
+ if (entry->inline_node)
+ printed += hist_browser__show_inline(browser,
+ entry->inline_node, row, 0);
+ else
+ printed += hist_browser__show_callchain(browser,
+ entry, 1, row,
+ hist_browser__show_callchain_entry,
+ &arg,
hist_browser__check_output_full);
}
@@ -2308,7 +2473,7 @@ static int switch_data_file(void)
return ret;
memset(options, 0, sizeof(options));
- memset(options, 0, sizeof(abs_path));
+ memset(abs_path, 0, sizeof(abs_path));
while ((dent = readdir(pwd_dir))) {
char path[PATH_MAX];
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 668f4aecf2e6..d52d5f64ea89 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -17,6 +17,67 @@ static size_t callchain__fprintf_left_margin(FILE *fp, int left_margin)
return ret;
}
+static size_t inline__fprintf(struct map *map, u64 ip, int left_margin,
+ int depth, int depth_mask, FILE *fp)
+{
+ struct dso *dso;
+ struct inline_node *node;
+ struct inline_list *ilist;
+ int ret = 0, i;
+
+ if (map == NULL)
+ return 0;
+
+ dso = map->dso;
+ if (dso == NULL)
+ return 0;
+
+ if (dso->kernel != DSO_TYPE_USER)
+ return 0;
+
+ node = dso__parse_addr_inlines(dso,
+ map__rip_2objdump(map, ip));
+ if (node == NULL)
+ return 0;
+
+ list_for_each_entry(ilist, &node->val, list) {
+ if ((ilist->filename != NULL) || (ilist->funcname != NULL)) {
+ ret += callchain__fprintf_left_margin(fp, left_margin);
+
+ for (i = 0; i < depth; i++) {
+ if (depth_mask & (1 << i))
+ ret += fprintf(fp, "|");
+ else
+ ret += fprintf(fp, " ");
+ ret += fprintf(fp, " ");
+ }
+
+ if (callchain_param.key == CCKEY_ADDRESS ||
+ callchain_param.key == CCKEY_SRCLINE) {
+ if (ilist->filename != NULL)
+ ret += fprintf(fp, "%s:%d (inline)",
+ ilist->filename,
+ ilist->line_nr);
+ else
+ ret += fprintf(fp, "??");
+ } else if (ilist->funcname != NULL)
+ ret += fprintf(fp, "%s (inline)",
+ ilist->funcname);
+ else if (ilist->filename != NULL)
+ ret += fprintf(fp, "%s:%d (inline)",
+ ilist->filename,
+ ilist->line_nr);
+ else
+ ret += fprintf(fp, "??");
+
+ ret += fprintf(fp, "\n");
+ }
+ }
+
+ inline_node__delete(node);
+ return ret;
+}
+
static size_t ipchain__fprintf_graph_line(FILE *fp, int depth, int depth_mask,
int left_margin)
{
@@ -78,6 +139,10 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node,
fputs(str, fp);
fputc('\n', fp);
free(alloc_str);
+
+ if (symbol_conf.inline_name)
+ ret += inline__fprintf(chain->ms.map, chain->ip,
+ left_margin, depth, depth_mask, fp);
return ret;
}
@@ -229,6 +294,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
if (!i++ && field_order == NULL &&
sort_order && !prefixcmp(sort_order, "sym"))
continue;
+
if (!printed) {
ret += callchain__fprintf_left_margin(fp, left_margin);
ret += fprintf(fp, "|\n");
@@ -251,6 +317,13 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
if (++entries_printed == callchain_param.print_limit)
break;
+
+ if (symbol_conf.inline_name)
+ ret += inline__fprintf(chain->ms.map,
+ chain->ip,
+ left_margin,
+ 0, 0,
+ fp);
}
root = &cnode->rb_root;
}
@@ -529,6 +602,8 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
bool use_callchain)
{
int ret;
+ int callchain_ret = 0;
+ int inline_ret = 0;
struct perf_hpp hpp = {
.buf = bf,
.size = size,
@@ -547,7 +622,16 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size,
ret = fprintf(fp, "%s\n", bf);
if (use_callchain)
- ret += hist_entry_callchain__fprintf(he, total_period, 0, fp);
+ callchain_ret = hist_entry_callchain__fprintf(he, total_period,
+ 0, fp);
+
+ if (callchain_ret == 0 && symbol_conf.inline_name) {
+ inline_ret = inline__fprintf(he->ms.map, he->ip, 0, 0, 0, fp);
+ ret += inline_ret;
+ if (inline_ret > 0)
+ ret += fprintf(fp, "\n");
+ } else
+ ret += callchain_ret;
return ret;
}
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 5da376bc1afc..5c0ea11a8f0a 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -1,4 +1,3 @@
-libperf-y += alias.o
libperf-y += annotate.o
libperf-y += block-range.o
libperf-y += build-id.o
@@ -42,6 +41,7 @@ libperf-y += pstack.o
libperf-y += session.o
libperf-$(CONFIG_AUDIT) += syscalltbl.o
libperf-y += ordered-events.o
+libperf-y += namespaces.o
libperf-y += comm.o
libperf-y += thread.o
libperf-y += thread_map.o
@@ -81,6 +81,7 @@ libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
libperf-$(CONFIG_AUXTRACE) += intel-pt.o
libperf-$(CONFIG_AUXTRACE) += intel-bts.o
libperf-y += parse-branch-options.o
+libperf-y += dump-insn.o
libperf-y += parse-regs-options.o
libperf-y += term.o
libperf-y += help-unknown-cmd.o
@@ -88,6 +89,7 @@ libperf-y += mem-events.o
libperf-y += vsprintf.o
libperf-y += drv_configs.o
libperf-y += time-utils.o
+libperf-y += expr-bison.o
libperf-$(CONFIG_LIBBPF) += bpf-loader.o
libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
@@ -140,6 +142,10 @@ $(OUTPUT)util/parse-events-bison.c: util/parse-events.y
$(call rule_mkdir)
$(Q)$(call echo-cmd,bison)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $@ -p parse_events_
+$(OUTPUT)util/expr-bison.c: util/expr.y
+ $(call rule_mkdir)
+ $(Q)$(call echo-cmd,bison)$(BISON) -v util/expr.y -d $(PARSER_DEBUG_BISON) -o $@ -p expr__
+
$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
$(call rule_mkdir)
$(Q)$(call echo-cmd,flex)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
@@ -152,6 +158,7 @@ CFLAGS_parse-events-flex.o += -w
CFLAGS_pmu-flex.o += -w
CFLAGS_parse-events-bison.o += -DYYENABLE_NLS=0 -w
CFLAGS_pmu-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
+CFLAGS_expr-bison.o += -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w
$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
diff --git a/tools/perf/util/alias.c b/tools/perf/util/alias.c
deleted file mode 100644
index 6455471d9cd1..000000000000
--- a/tools/perf/util/alias.c
+++ /dev/null
@@ -1,78 +0,0 @@
-#include "cache.h"
-#include "util.h"
-#include "config.h"
-
-static const char *alias_key;
-static char *alias_val;
-
-static int alias_lookup_cb(const char *k, const char *v,
- void *cb __maybe_unused)
-{
- if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
- if (!v)
- return config_error_nonbool(k);
- alias_val = strdup(v);
- return 0;
- }
- return 0;
-}
-
-char *alias_lookup(const char *alias)
-{
- alias_key = alias;
- alias_val = NULL;
- perf_config(alias_lookup_cb, NULL);
- return alias_val;
-}
-
-int split_cmdline(char *cmdline, const char ***argv)
-{
- int src, dst, count = 0, size = 16;
- char quoted = 0;
-
- *argv = malloc(sizeof(char*) * size);
-
- /* split alias_string */
- (*argv)[count++] = cmdline;
- for (src = dst = 0; cmdline[src];) {
- char c = cmdline[src];
- if (!quoted && isspace(c)) {
- cmdline[dst++] = 0;
- while (cmdline[++src]
- && isspace(cmdline[src]))
- ; /* skip */
- if (count >= size) {
- size += 16;
- *argv = realloc(*argv, sizeof(char*) * size);
- }
- (*argv)[count++] = cmdline + dst;
- } else if (!quoted && (c == '\'' || c == '"')) {
- quoted = c;
- src++;
- } else if (c == quoted) {
- quoted = 0;
- src++;
- } else {
- if (c == '\\' && quoted != '\'') {
- src++;
- c = cmdline[src];
- if (!c) {
- zfree(argv);
- return error("cmdline ends with \\");
- }
- }
- cmdline[dst++] = c;
- src++;
- }
- }
-
- cmdline[dst] = 0;
-
- if (quoted) {
- zfree(argv);
- return error("unclosed quote");
- }
-
- return count;
-}
-
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 273f21fa32b5..a37032bd137d 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -1307,6 +1307,7 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil
{
char linkname[PATH_MAX];
char *build_id_filename;
+ char *build_id_path = NULL;
if (dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
!dso__is_kcore(dso))
@@ -1322,8 +1323,14 @@ static int dso__disassemble_filename(struct dso *dso, char *filename, size_t fil
goto fallback;
}
+ build_id_path = strdup(filename);
+ if (!build_id_path)
+ return -1;
+
+ dirname(build_id_path);
+
if (dso__is_kcore(dso) ||
- readlink(filename, linkname, sizeof(linkname)) < 0 ||
+ readlink(build_id_path, linkname, sizeof(linkname)) < 0 ||
strstr(linkname, DSO__NAME_KALLSYMS) ||
access(filename, R_OK)) {
fallback:
@@ -1335,6 +1342,7 @@ fallback:
__symbol__join_symfs(filename, filename_size, dso->long_name);
}
+ free(build_id_path);
return 0;
}
@@ -1435,7 +1443,7 @@ int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_na
snprintf(command, sizeof(command),
"%s %s%s --start-address=0x%016" PRIx64
" --stop-address=0x%016" PRIx64
- " -l -d %s %s -C %s 2>/dev/null|grep -v %s|expand",
+ " -l -d %s %s -C %s 2>/dev/null|grep -v %s:|expand",
objdump_path ? objdump_path : "objdump",
disassembler_style ? "-M " : "",
disassembler_style ? disassembler_style : "",
@@ -1482,6 +1490,12 @@ int symbol__disassemble(struct symbol *sym, struct map *map, const char *arch_na
nline = 0;
while (!feof(file)) {
+ /*
+ * The source code line number (lineno) needs to be kept in
+ * accross calls to symbol__parse_objdump_line(), so that it
+ * can associate it with the instructions till the next one.
+ * See disasm_line__new() and struct disasm_line::line_nr.
+ */
if (symbol__parse_objdump_line(sym, map, arch, file, privsize,
&lineno) < 0)
break;
@@ -1651,24 +1665,31 @@ static int symbol__get_source_line(struct symbol *sym, struct map *map,
start = map__rip_2objdump(map, sym->start);
for (i = 0; i < len; i++) {
- u64 offset;
+ u64 offset, nr_samples;
double percent_max = 0.0;
src_line->nr_pcnt = nr_pcnt;
for (k = 0; k < nr_pcnt; k++) {
+ double percent = 0.0;
+
h = annotation__histogram(notes, evidx + k);
- src_line->samples[k].percent = 100.0 * h->addr[i] / h->sum;
+ nr_samples = h->addr[i];
+ if (h->sum)
+ percent = 100.0 * nr_samples / h->sum;
- if (src_line->samples[k].percent > percent_max)
- percent_max = src_line->samples[k].percent;
+ if (percent > percent_max)
+ percent_max = percent;
+ src_line->samples[k].percent = percent;
+ src_line->samples[k].nr = nr_samples;
}
if (percent_max <= 0.5)
goto next;
offset = start + i;
- src_line->path = get_srcline(map->dso, offset, NULL, false);
+ src_line->path = get_srcline(map->dso, offset, NULL,
+ false, true);
insert_source_line(&tmp_root, src_line);
next:
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 09776b5af991..948aa8e6fd39 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -98,7 +98,7 @@ struct cyc_hist {
struct source_line_samples {
double percent;
double percent_sum;
- double nr;
+ u64 nr;
};
struct source_line {
diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c
index c5a6e0b12452..78bd632f144d 100644
--- a/tools/perf/util/auxtrace.c
+++ b/tools/perf/util/auxtrace.c
@@ -1826,7 +1826,7 @@ static int addr_filter__resolve_kernel_syms(struct addr_filter *filt)
filt->addr = start;
if (filt->range && !filt->size && !filt->sym_to) {
filt->size = size;
- no_size = !!size;
+ no_size = !size;
}
}
@@ -1840,7 +1840,7 @@ static int addr_filter__resolve_kernel_syms(struct addr_filter *filt)
if (err)
return err;
filt->size = start + size - filt->addr;
- no_size = !!size;
+ no_size = !size;
}
/* The very last symbol in kallsyms does not imply a particular size */
diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c
index e528c40739cc..33af67530d30 100644
--- a/tools/perf/util/build-id.c
+++ b/tools/perf/util/build-id.c
@@ -182,13 +182,17 @@ char *build_id_cache__origname(const char *sbuild_id)
char buf[PATH_MAX];
char *ret = NULL, *p;
size_t offs = 5; /* == strlen("../..") */
+ ssize_t len;
linkname = build_id_cache__linkname(sbuild_id, NULL, 0);
if (!linkname)
return NULL;
- if (readlink(linkname, buf, PATH_MAX) < 0)
+ len = readlink(linkname, buf, sizeof(buf) - 1);
+ if (len <= 0)
goto out;
+ buf[len] = '\0';
+
/* The link should be "../..<origpath>/<sbuild_id>" */
p = strrchr(buf, '/'); /* Cut off the "/<sbuild_id>" */
if (p && (p > buf + offs)) {
@@ -690,7 +694,7 @@ int build_id_cache__add_s(const char *sbuild_id, const char *name,
err = 0;
/* Update SDT cache : error is just warned */
- if (build_id_cache__add_sdt_cache(sbuild_id, realname) < 0)
+ if (realname && build_id_cache__add_sdt_cache(sbuild_id, realname) < 0)
pr_debug4("Failed to update/scan SDT cache for %s\n", realname);
out_free:
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 512c0c83fbc6..0328f297a748 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -15,7 +15,6 @@
#define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR"
#define PERF_PAGER_ENVIRONMENT "PERF_PAGER"
-char *alias_lookup(const char *alias);
int split_cmdline(char *cmdline, const char ***argv);
#define alloc_nr(x) (((x)+16)*3/2)
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index aba953421a03..3cea1fb5404b 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -80,6 +80,10 @@ static int parse_callchain_sort_key(const char *value)
callchain_param.key = CCKEY_ADDRESS;
return 0;
}
+ if (!strncmp(value, "srcline", strlen(value))) {
+ callchain_param.key = CCKEY_SRCLINE;
+ return 0;
+ }
if (!strncmp(value, "branch", strlen(value))) {
callchain_param.branch_callstack = 1;
return 0;
@@ -510,14 +514,51 @@ enum match_result {
MATCH_GT,
};
+static enum match_result match_chain_srcline(struct callchain_cursor_node *node,
+ struct callchain_list *cnode)
+{
+ char *left = get_srcline(cnode->ms.map->dso,
+ map__rip_2objdump(cnode->ms.map, cnode->ip),
+ cnode->ms.sym, true, false);
+ char *right = get_srcline(node->map->dso,
+ map__rip_2objdump(node->map, node->ip),
+ node->sym, true, false);
+ enum match_result ret = MATCH_EQ;
+ int cmp;
+
+ if (left && right)
+ cmp = strcmp(left, right);
+ else if (!left && right)
+ cmp = 1;
+ else if (left && !right)
+ cmp = -1;
+ else if (cnode->ip == node->ip)
+ cmp = 0;
+ else
+ cmp = (cnode->ip < node->ip) ? -1 : 1;
+
+ if (cmp != 0)
+ ret = cmp < 0 ? MATCH_LT : MATCH_GT;
+
+ free_srcline(left);
+ free_srcline(right);
+ return ret;
+}
+
static enum match_result match_chain(struct callchain_cursor_node *node,
struct callchain_list *cnode)
{
struct symbol *sym = node->sym;
u64 left, right;
- if (cnode->ms.sym && sym &&
- callchain_param.key == CCKEY_FUNCTION) {
+ if (callchain_param.key == CCKEY_SRCLINE) {
+ enum match_result match = match_chain_srcline(node, cnode);
+
+ if (match != MATCH_ERROR)
+ return match;
+ }
+
+ if (cnode->ms.sym && sym && callchain_param.key == CCKEY_FUNCTION) {
left = cnode->ms.sym->start;
right = sym->start;
} else {
@@ -911,15 +952,16 @@ out:
char *callchain_list__sym_name(struct callchain_list *cl,
char *bf, size_t bfsize, bool show_dso)
{
+ bool show_addr = callchain_param.key == CCKEY_ADDRESS;
+ bool show_srcline = show_addr || callchain_param.key == CCKEY_SRCLINE;
int printed;
if (cl->ms.sym) {
- if (callchain_param.key == CCKEY_ADDRESS &&
- cl->ms.map && !cl->srcline)
+ if (show_srcline && cl->ms.map && !cl->srcline)
cl->srcline = get_srcline(cl->ms.map->dso,
map__rip_2objdump(cl->ms.map,
cl->ip),
- cl->ms.sym, false);
+ cl->ms.sym, false, show_addr);
if (cl->srcline)
printed = scnprintf(bf, bfsize, "%s %s",
cl->ms.sym->name, cl->srcline);
@@ -1063,63 +1105,100 @@ int callchain_branch_counts(struct callchain_root *root,
cycles_count);
}
-static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
- u64 branch_count, u64 predicted_count,
- u64 abort_count, u64 cycles_count,
- u64 iter_count, u64 samples_count)
+static int counts_str_build(char *bf, int bfsize,
+ u64 branch_count, u64 predicted_count,
+ u64 abort_count, u64 cycles_count,
+ u64 iter_count, u64 samples_count)
{
double predicted_percent = 0.0;
const char *null_str = "";
char iter_str[32];
- char *str;
- u64 cycles = 0;
-
- if (branch_count == 0) {
- if (fp)
- return fprintf(fp, " (calltrace)");
+ char cycle_str[32];
+ char *istr, *cstr;
+ u64 cycles;
+ if (branch_count == 0)
return scnprintf(bf, bfsize, " (calltrace)");
- }
+
+ cycles = cycles_count / branch_count;
if (iter_count && samples_count) {
- scnprintf(iter_str, sizeof(iter_str),
- ", iterations:%" PRId64 "",
- iter_count / samples_count);
- str = iter_str;
+ if (cycles > 0)
+ scnprintf(iter_str, sizeof(iter_str),
+ " iterations:%" PRId64 "",
+ iter_count / samples_count);
+ else
+ scnprintf(iter_str, sizeof(iter_str),
+ "iterations:%" PRId64 "",
+ iter_count / samples_count);
+ istr = iter_str;
} else
- str = (char *)null_str;
+ istr = (char *)null_str;
+
+ if (cycles > 0) {
+ scnprintf(cycle_str, sizeof(cycle_str),
+ "cycles:%" PRId64 "", cycles);
+ cstr = cycle_str;
+ } else
+ cstr = (char *)null_str;
predicted_percent = predicted_count * 100.0 / branch_count;
- cycles = cycles_count / branch_count;
- if ((predicted_percent >= 100.0) && (abort_count == 0)) {
- if (fp)
- return fprintf(fp, " (cycles:%" PRId64 "%s)",
- cycles, str);
+ if ((predicted_count == branch_count) && (abort_count == 0)) {
+ if ((cycles > 0) || (istr != (char *)null_str))
+ return scnprintf(bf, bfsize, " (%s%s)", cstr, istr);
+ else
+ return scnprintf(bf, bfsize, "%s", (char *)null_str);
+ }
- return scnprintf(bf, bfsize, " (cycles:%" PRId64 "%s)",
- cycles, str);
+ if ((predicted_count < branch_count) && (abort_count == 0)) {
+ if ((cycles > 0) || (istr != (char *)null_str))
+ return scnprintf(bf, bfsize,
+ " (predicted:%.1f%% %s%s)",
+ predicted_percent, cstr, istr);
+ else {
+ return scnprintf(bf, bfsize,
+ " (predicted:%.1f%%)",
+ predicted_percent);
+ }
}
- if ((predicted_percent < 100.0) && (abort_count == 0)) {
- if (fp)
- return fprintf(fp,
- " (predicted:%.1f%%, cycles:%" PRId64 "%s)",
- predicted_percent, cycles, str);
+ if ((predicted_count == branch_count) && (abort_count > 0)) {
+ if ((cycles > 0) || (istr != (char *)null_str))
+ return scnprintf(bf, bfsize,
+ " (abort:%" PRId64 " %s%s)",
+ abort_count, cstr, istr);
+ else
+ return scnprintf(bf, bfsize,
+ " (abort:%" PRId64 ")",
+ abort_count);
+ }
+ if ((cycles > 0) || (istr != (char *)null_str))
return scnprintf(bf, bfsize,
- " (predicted:%.1f%%, cycles:%" PRId64 "%s)",
- predicted_percent, cycles, str);
- }
+ " (predicted:%.1f%% abort:%" PRId64 " %s%s)",
+ predicted_percent, abort_count, cstr, istr);
+
+ return scnprintf(bf, bfsize,
+ " (predicted:%.1f%% abort:%" PRId64 ")",
+ predicted_percent, abort_count);
+}
+
+static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
+ u64 branch_count, u64 predicted_count,
+ u64 abort_count, u64 cycles_count,
+ u64 iter_count, u64 samples_count)
+{
+ char str[128];
+
+ counts_str_build(str, sizeof(str), branch_count,
+ predicted_count, abort_count, cycles_count,
+ iter_count, samples_count);
if (fp)
- return fprintf(fp,
- " (predicted:%.1f%%, abort:%" PRId64 ", cycles:%" PRId64 "%s)",
- predicted_percent, abort_count, cycles, str);
+ return fprintf(fp, "%s", str);
- return scnprintf(bf, bfsize,
- " (predicted:%.1f%%, abort:%" PRId64 ", cycles:%" PRId64 "%s)",
- predicted_percent, abort_count, cycles, str);
+ return scnprintf(bf, bfsize, "%s", str);
}
int callchain_list_counts__printf_value(struct callchain_node *node,
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 4f4b60f1558a..c56c23dbbf72 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -77,7 +77,8 @@ typedef void (*sort_chain_func_t)(struct rb_root *, struct callchain_root *,
enum chain_key {
CCKEY_FUNCTION,
- CCKEY_ADDRESS
+ CCKEY_ADDRESS,
+ CCKEY_SRCLINE
};
enum chain_value {
diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c
index eafbf11442b2..86399eda3684 100644
--- a/tools/perf/util/cgroup.c
+++ b/tools/perf/util/cgroup.c
@@ -127,19 +127,19 @@ static int add_cgroup(struct perf_evlist *evlist, char *str)
goto found;
n++;
}
- if (atomic_read(&cgrp->refcnt) == 0)
+ if (refcount_read(&cgrp->refcnt) == 0)
free(cgrp);
return -1;
found:
- atomic_inc(&cgrp->refcnt);
+ refcount_inc(&cgrp->refcnt);
counter->cgrp = cgrp;
return 0;
}
void close_cgroup(struct cgroup_sel *cgrp)
{
- if (cgrp && atomic_dec_and_test(&cgrp->refcnt)) {
+ if (cgrp && refcount_dec_and_test(&cgrp->refcnt)) {
close(cgrp->fd);
zfree(&cgrp->name);
free(cgrp);
diff --git a/tools/perf/util/cgroup.h b/tools/perf/util/cgroup.h
index 31f8dcdbd7ef..d91966b97cbd 100644
--- a/tools/perf/util/cgroup.h
+++ b/tools/perf/util/cgroup.h
@@ -1,14 +1,14 @@
#ifndef __CGROUP_H__
#define __CGROUP_H__
-#include <linux/atomic.h>
+#include <linux/refcount.h>
struct option;
struct cgroup_sel {
char *name;
int fd;
- atomic_t refcnt;
+ refcount_t refcnt;
};
diff --git a/tools/perf/util/cloexec.h b/tools/perf/util/cloexec.h
index d0d465953d36..94a5a7d829d5 100644
--- a/tools/perf/util/cloexec.h
+++ b/tools/perf/util/cloexec.h
@@ -3,10 +3,4 @@
unsigned long perf_event_open_cloexec_flag(void);
-#ifdef __GLIBC_PREREQ
-#if !__GLIBC_PREREQ(2, 6) && !defined(__UCLIBC__)
-int sched_getcpu(void) __THROW;
-#endif
-#endif
-
#endif /* __PERF_CLOEXEC_H */
diff --git a/tools/perf/util/comm.c b/tools/perf/util/comm.c
index 21b7ff382c3f..32837b6f7879 100644
--- a/tools/perf/util/comm.c
+++ b/tools/perf/util/comm.c
@@ -2,12 +2,12 @@
#include "util.h"
#include <stdlib.h>
#include <stdio.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
struct comm_str {
char *str;
struct rb_node rb_node;
- atomic_t refcnt;
+ refcount_t refcnt;
};
/* Should perhaps be moved to struct machine */
@@ -16,13 +16,13 @@ static struct rb_root comm_str_root;
static struct comm_str *comm_str__get(struct comm_str *cs)
{
if (cs)
- atomic_inc(&cs->refcnt);
+ refcount_inc(&cs->refcnt);
return cs;
}
static void comm_str__put(struct comm_str *cs)
{
- if (cs && atomic_dec_and_test(&cs->refcnt)) {
+ if (cs && refcount_dec_and_test(&cs->refcnt)) {
rb_erase(&cs->rb_node, &comm_str_root);
zfree(&cs->str);
free(cs);
@@ -43,7 +43,7 @@ static struct comm_str *comm_str__alloc(const char *str)
return NULL;
}
- atomic_set(&cs->refcnt, 0);
+ refcount_set(&cs->refcnt, 1);
return cs;
}
@@ -61,7 +61,7 @@ static struct comm_str *comm_str__findnew(const char *str, struct rb_root *root)
cmp = strcmp(str, iter->str);
if (!cmp)
- return iter;
+ return comm_str__get(iter);
if (cmp < 0)
p = &(*p)->rb_left;
@@ -95,8 +95,6 @@ struct comm *comm__new(const char *str, u64 timestamp, bool exec)
return NULL;
}
- comm_str__get(comm->comm_str);
-
return comm;
}
@@ -108,7 +106,6 @@ int comm__override(struct comm *comm, const char *str, u64 timestamp, bool exec)
if (!new)
return -ENOMEM;
- comm_str__get(new);
comm_str__put(old);
comm->comm_str = new;
comm->start = timestamp;
diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c
index 0c7d5a4975cd..7b01d59076d3 100644
--- a/tools/perf/util/config.c
+++ b/tools/perf/util/config.c
@@ -627,6 +627,8 @@ static int perf_config_set__init(struct perf_config_set *set)
{
int ret = -1;
const char *home = NULL;
+ char *user_config;
+ struct stat st;
/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
if (config_exclusive_filename)
@@ -637,35 +639,41 @@ static int perf_config_set__init(struct perf_config_set *set)
}
home = getenv("HOME");
- if (perf_config_global() && home) {
- char *user_config = strdup(mkpath("%s/.perfconfig", home));
- struct stat st;
- if (user_config == NULL) {
- warning("Not enough memory to process %s/.perfconfig, "
- "ignoring it.", home);
- goto out;
- }
+ /*
+ * Skip reading user config if:
+ * - there is no place to read it from (HOME)
+ * - we are asked not to (PERF_CONFIG_NOGLOBAL=1)
+ */
+ if (!home || !*home || !perf_config_global())
+ return 0;
- if (stat(user_config, &st) < 0) {
- if (errno == ENOENT)
- ret = 0;
- goto out_free;
- }
+ user_config = strdup(mkpath("%s/.perfconfig", home));
+ if (user_config == NULL) {
+ warning("Not enough memory to process %s/.perfconfig, "
+ "ignoring it.", home);
+ goto out;
+ }
+
+ if (stat(user_config, &st) < 0) {
+ if (errno == ENOENT)
+ ret = 0;
+ goto out_free;
+ }
- ret = 0;
+ ret = 0;
- if (st.st_uid && (st.st_uid != geteuid())) {
- warning("File %s not owned by current user or root, "
- "ignoring it.", user_config);
- goto out_free;
- }
+ if (st.st_uid && (st.st_uid != geteuid())) {
+ warning("File %s not owned by current user or root, "
+ "ignoring it.", user_config);
+ goto out_free;
+ }
+
+ if (st.st_size)
+ ret = perf_config_from_file(collect_config, user_config, set);
- if (st.st_size)
- ret = perf_config_from_file(collect_config, user_config, set);
out_free:
- free(user_config);
- }
+ free(user_config);
out:
return ret;
}
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 8c7504939113..061018b42393 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -29,7 +29,7 @@ static struct cpu_map *cpu_map__default_new(void)
cpus->map[i] = i;
cpus->nr = nr_cpus;
- atomic_set(&cpus->refcnt, 1);
+ refcount_set(&cpus->refcnt, 1);
}
return cpus;
@@ -43,7 +43,7 @@ static struct cpu_map *cpu_map__trim_new(int nr_cpus, int *tmp_cpus)
if (cpus != NULL) {
cpus->nr = nr_cpus;
memcpy(cpus->map, tmp_cpus, payload_size);
- atomic_set(&cpus->refcnt, 1);
+ refcount_set(&cpus->refcnt, 1);
}
return cpus;
@@ -252,7 +252,7 @@ struct cpu_map *cpu_map__dummy_new(void)
if (cpus != NULL) {
cpus->nr = 1;
cpus->map[0] = -1;
- atomic_set(&cpus->refcnt, 1);
+ refcount_set(&cpus->refcnt, 1);
}
return cpus;
@@ -269,7 +269,7 @@ struct cpu_map *cpu_map__empty_new(int nr)
for (i = 0; i < nr; i++)
cpus->map[i] = -1;
- atomic_set(&cpus->refcnt, 1);
+ refcount_set(&cpus->refcnt, 1);
}
return cpus;
@@ -278,7 +278,7 @@ struct cpu_map *cpu_map__empty_new(int nr)
static void cpu_map__delete(struct cpu_map *map)
{
if (map) {
- WARN_ONCE(atomic_read(&map->refcnt) != 0,
+ WARN_ONCE(refcount_read(&map->refcnt) != 0,
"cpu_map refcnt unbalanced\n");
free(map);
}
@@ -287,13 +287,13 @@ static void cpu_map__delete(struct cpu_map *map)
struct cpu_map *cpu_map__get(struct cpu_map *map)
{
if (map)
- atomic_inc(&map->refcnt);
+ refcount_inc(&map->refcnt);
return map;
}
void cpu_map__put(struct cpu_map *map)
{
- if (map && atomic_dec_and_test(&map->refcnt))
+ if (map && refcount_dec_and_test(&map->refcnt))
cpu_map__delete(map);
}
@@ -357,7 +357,7 @@ int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
/* ensure we process id in increasing order */
qsort(c->map, c->nr, sizeof(int), cmp_ids);
- atomic_set(&c->refcnt, 1);
+ refcount_set(&c->refcnt, 1);
*res = c;
return 0;
}
@@ -673,3 +673,49 @@ size_t cpu_map__snprint(struct cpu_map *map, char *buf, size_t size)
pr_debug("cpumask list: %s\n", buf);
return ret;
}
+
+static char hex_char(unsigned char val)
+{
+ if (val < 10)
+ return val + '0';
+ if (val < 16)
+ return val - 10 + 'a';
+ return '?';
+}
+
+size_t cpu_map__snprint_mask(struct cpu_map *map, char *buf, size_t size)
+{
+ int i, cpu;
+ char *ptr = buf;
+ unsigned char *bitmap;
+ int last_cpu = cpu_map__cpu(map, map->nr - 1);
+
+ bitmap = zalloc((last_cpu + 7) / 8);
+ if (bitmap == NULL) {
+ buf[0] = '\0';
+ return 0;
+ }
+
+ for (i = 0; i < map->nr; i++) {
+ cpu = cpu_map__cpu(map, i);
+ bitmap[cpu / 8] |= 1 << (cpu % 8);
+ }
+
+ for (cpu = last_cpu / 4 * 4; cpu >= 0; cpu -= 4) {
+ unsigned char bits = bitmap[cpu / 8];
+
+ if (cpu % 8)
+ bits >>= 4;
+ else
+ bits &= 0xf;
+
+ *ptr++ = hex_char(bits);
+ if ((cpu % 32) == 0 && cpu > 0)
+ *ptr++ = ',';
+ }
+ *ptr = '\0';
+ free(bitmap);
+
+ buf[size - 1] = '\0';
+ return ptr - buf;
+}
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 1a0549af8f5c..6b8bff87481d 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -3,13 +3,13 @@
#include <stdio.h>
#include <stdbool.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include "perf.h"
#include "util/debug.h"
struct cpu_map {
- atomic_t refcnt;
+ refcount_t refcnt;
int nr;
int map[];
};
@@ -20,6 +20,7 @@ struct cpu_map *cpu_map__dummy_new(void);
struct cpu_map *cpu_map__new_data(struct cpu_map_data *data);
struct cpu_map *cpu_map__read(FILE *file);
size_t cpu_map__snprint(struct cpu_map *map, char *buf, size_t size);
+size_t cpu_map__snprint_mask(struct cpu_map *map, char *buf, size_t size);
size_t cpu_map__fprintf(struct cpu_map *map, FILE *fp);
int cpu_map__get_socket_id(int cpu);
int cpu_map__get_socket(struct cpu_map *map, int idx, void *data);
diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c
index 4e6cbc99f08e..89ece2445713 100644
--- a/tools/perf/util/data-convert-bt.c
+++ b/tools/perf/util/data-convert-bt.c
@@ -1468,6 +1468,7 @@ int bt_convert__perf2ctf(const char *input, const char *path,
.lost = perf_event__process_lost,
.tracing_data = perf_event__process_tracing_data,
.build_id = perf_event__process_build_id,
+ .namespaces = perf_event__process_namespaces,
.ordered_events = true,
.ordering_requires_timestamps = true,
},
diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c
index d38b62a700ca..42db00d78573 100644
--- a/tools/perf/util/dso.c
+++ b/tools/perf/util/dso.c
@@ -1109,7 +1109,7 @@ struct dso *dso__new(const char *name)
INIT_LIST_HEAD(&dso->node);
INIT_LIST_HEAD(&dso->data.open_entry);
pthread_mutex_init(&dso->lock, NULL);
- atomic_set(&dso->refcnt, 1);
+ refcount_set(&dso->refcnt, 1);
}
return dso;
@@ -1147,13 +1147,13 @@ void dso__delete(struct dso *dso)
struct dso *dso__get(struct dso *dso)
{
if (dso)
- atomic_inc(&dso->refcnt);
+ refcount_inc(&dso->refcnt);
return dso;
}
void dso__put(struct dso *dso)
{
- if (dso && atomic_dec_and_test(&dso->refcnt))
+ if (dso && refcount_dec_and_test(&dso->refcnt))
dso__delete(dso);
}
diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h
index ecc4bbd3f82e..12350b171727 100644
--- a/tools/perf/util/dso.h
+++ b/tools/perf/util/dso.h
@@ -1,7 +1,7 @@
#ifndef __PERF_DSO
#define __PERF_DSO
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/rbtree.h>
#include <sys/types.h>
@@ -187,7 +187,7 @@ struct dso {
void *priv;
u64 db_id;
};
- atomic_t refcnt;
+ refcount_t refcnt;
char name[0];
};
diff --git a/tools/perf/util/dump-insn.c b/tools/perf/util/dump-insn.c
new file mode 100644
index 000000000000..ffbdb19f05d0
--- /dev/null
+++ b/tools/perf/util/dump-insn.c
@@ -0,0 +1,14 @@
+#include <linux/compiler.h>
+#include "dump-insn.h"
+
+/* Fallback code */
+
+__weak
+const char *dump_insn(struct perf_insn *x __maybe_unused,
+ u64 ip __maybe_unused, u8 *inbuf __maybe_unused,
+ int inlen __maybe_unused, int *lenp)
+{
+ if (lenp)
+ *lenp = 0;
+ return "?";
+}
diff --git a/tools/perf/util/dump-insn.h b/tools/perf/util/dump-insn.h
new file mode 100644
index 000000000000..90fb115981cf
--- /dev/null
+++ b/tools/perf/util/dump-insn.h
@@ -0,0 +1,22 @@
+#ifndef __PERF_DUMP_INSN_H
+#define __PERF_DUMP_INSN_H 1
+
+#define MAXINSN 15
+
+#include <linux/types.h>
+
+struct thread;
+
+struct perf_insn {
+ /* Initialized by callers: */
+ struct thread *thread;
+ u8 cpumode;
+ bool is64bit;
+ int cpu;
+ /* Temporary */
+ char out[256];
+};
+
+const char *dump_insn(struct perf_insn *x, u64 ip,
+ u8 *inbuf, int inlen, int *lenp);
+#endif
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 4ea7ce72ed9c..76b9c6bc8369 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -31,6 +31,7 @@ static const char *perf_event__names[] = {
[PERF_RECORD_LOST_SAMPLES] = "LOST_SAMPLES",
[PERF_RECORD_SWITCH] = "SWITCH",
[PERF_RECORD_SWITCH_CPU_WIDE] = "SWITCH_CPU_WIDE",
+ [PERF_RECORD_NAMESPACES] = "NAMESPACES",
[PERF_RECORD_HEADER_ATTR] = "ATTR",
[PERF_RECORD_HEADER_EVENT_TYPE] = "EVENT_TYPE",
[PERF_RECORD_HEADER_TRACING_DATA] = "TRACING_DATA",
@@ -49,6 +50,16 @@ static const char *perf_event__names[] = {
[PERF_RECORD_TIME_CONV] = "TIME_CONV",
};
+static const char *perf_ns__names[] = {
+ [NET_NS_INDEX] = "net",
+ [UTS_NS_INDEX] = "uts",
+ [IPC_NS_INDEX] = "ipc",
+ [PID_NS_INDEX] = "pid",
+ [USER_NS_INDEX] = "user",
+ [MNT_NS_INDEX] = "mnt",
+ [CGROUP_NS_INDEX] = "cgroup",
+};
+
const char *perf_event__name(unsigned int id)
{
if (id >= ARRAY_SIZE(perf_event__names))
@@ -58,6 +69,13 @@ const char *perf_event__name(unsigned int id)
return perf_event__names[id];
}
+static const char *perf_ns__name(unsigned int id)
+{
+ if (id >= ARRAY_SIZE(perf_ns__names))
+ return "UNKNOWN";
+ return perf_ns__names[id];
+}
+
static int perf_tool__process_synth_event(struct perf_tool *tool,
union perf_event *event,
struct machine *machine,
@@ -203,6 +221,58 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
return tgid;
}
+static void perf_event__get_ns_link_info(pid_t pid, const char *ns,
+ struct perf_ns_link_info *ns_link_info)
+{
+ struct stat64 st;
+ char proc_ns[128];
+
+ sprintf(proc_ns, "/proc/%u/ns/%s", pid, ns);
+ if (stat64(proc_ns, &st) == 0) {
+ ns_link_info->dev = st.st_dev;
+ ns_link_info->ino = st.st_ino;
+ }
+}
+
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+ union perf_event *event,
+ pid_t pid, pid_t tgid,
+ perf_event__handler_t process,
+ struct machine *machine)
+{
+ u32 idx;
+ struct perf_ns_link_info *ns_link_info;
+
+ if (!tool || !tool->namespace_events)
+ return 0;
+
+ memset(&event->namespaces, 0, (sizeof(event->namespaces) +
+ (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+ machine->id_hdr_size));
+
+ event->namespaces.pid = tgid;
+ event->namespaces.tid = pid;
+
+ event->namespaces.nr_namespaces = NR_NAMESPACES;
+
+ ns_link_info = event->namespaces.link_info;
+
+ for (idx = 0; idx < event->namespaces.nr_namespaces; idx++)
+ perf_event__get_ns_link_info(pid, perf_ns__name(idx),
+ &ns_link_info[idx]);
+
+ event->namespaces.header.type = PERF_RECORD_NAMESPACES;
+
+ event->namespaces.header.size = (sizeof(event->namespaces) +
+ (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+ machine->id_hdr_size);
+
+ if (perf_tool__process_synth_event(tool, event, machine, process) != 0)
+ return -1;
+
+ return 0;
+}
+
static int perf_event__synthesize_fork(struct perf_tool *tool,
union perf_event *event,
pid_t pid, pid_t tgid, pid_t ppid,
@@ -255,8 +325,8 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool,
if (machine__is_default_guest(machine))
return 0;
- snprintf(filename, sizeof(filename), "%s/proc/%d/maps",
- machine->root_dir, pid);
+ snprintf(filename, sizeof(filename), "%s/proc/%d/task/%d/maps",
+ machine->root_dir, pid, pid);
fp = fopen(filename, "r");
if (fp == NULL) {
@@ -434,8 +504,9 @@ int perf_event__synthesize_modules(struct perf_tool *tool,
static int __event__synthesize_thread(union perf_event *comm_event,
union perf_event *mmap_event,
union perf_event *fork_event,
+ union perf_event *namespaces_event,
pid_t pid, int full,
- perf_event__handler_t process,
+ perf_event__handler_t process,
struct perf_tool *tool,
struct machine *machine,
bool mmap_data,
@@ -455,6 +526,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
if (tgid == -1)
return -1;
+ if (perf_event__synthesize_namespaces(tool, namespaces_event, pid,
+ tgid, process, machine) < 0)
+ return -1;
+
+
return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid,
process, machine, mmap_data,
proc_map_timeout);
@@ -488,6 +564,11 @@ static int __event__synthesize_thread(union perf_event *comm_event,
if (perf_event__synthesize_fork(tool, fork_event, _pid, tgid,
ppid, process, machine) < 0)
break;
+
+ if (perf_event__synthesize_namespaces(tool, namespaces_event, _pid,
+ tgid, process, machine) < 0)
+ break;
+
/*
* Send the prepared comm event
*/
@@ -516,6 +597,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
unsigned int proc_map_timeout)
{
union perf_event *comm_event, *mmap_event, *fork_event;
+ union perf_event *namespaces_event;
int err = -1, thread, j;
comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
@@ -530,10 +612,16 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
if (fork_event == NULL)
goto out_free_mmap;
+ namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+ (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+ machine->id_hdr_size);
+ if (namespaces_event == NULL)
+ goto out_free_fork;
+
err = 0;
for (thread = 0; thread < threads->nr; ++thread) {
if (__event__synthesize_thread(comm_event, mmap_event,
- fork_event,
+ fork_event, namespaces_event,
thread_map__pid(threads, thread), 0,
process, tool, machine,
mmap_data, proc_map_timeout)) {
@@ -559,7 +647,7 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
/* if not, generate events for it */
if (need_leader &&
__event__synthesize_thread(comm_event, mmap_event,
- fork_event,
+ fork_event, namespaces_event,
comm_event->comm.pid, 0,
process, tool, machine,
mmap_data, proc_map_timeout)) {
@@ -568,6 +656,8 @@ int perf_event__synthesize_thread_map(struct perf_tool *tool,
}
}
}
+ free(namespaces_event);
+out_free_fork:
free(fork_event);
out_free_mmap:
free(mmap_event);
@@ -587,6 +677,7 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
char proc_path[PATH_MAX];
struct dirent *dirent;
union perf_event *comm_event, *mmap_event, *fork_event;
+ union perf_event *namespaces_event;
int err = -1;
if (machine__is_default_guest(machine))
@@ -604,11 +695,17 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
if (fork_event == NULL)
goto out_free_mmap;
+ namespaces_event = malloc(sizeof(namespaces_event->namespaces) +
+ (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
+ machine->id_hdr_size);
+ if (namespaces_event == NULL)
+ goto out_free_fork;
+
snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
proc = opendir(proc_path);
if (proc == NULL)
- goto out_free_fork;
+ goto out_free_namespaces;
while ((dirent = readdir(proc)) != NULL) {
char *end;
@@ -620,13 +717,16 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
* We may race with exiting thread, so don't stop just because
* one thread couldn't be synthesized.
*/
- __event__synthesize_thread(comm_event, mmap_event, fork_event, pid,
- 1, process, tool, machine, mmap_data,
+ __event__synthesize_thread(comm_event, mmap_event, fork_event,
+ namespaces_event, pid, 1, process,
+ tool, machine, mmap_data,
proc_map_timeout);
}
err = 0;
closedir(proc);
+out_free_namespaces:
+ free(namespaces_event);
out_free_fork:
free(fork_event);
out_free_mmap:
@@ -1008,6 +1108,33 @@ size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp)
return fprintf(fp, "%s: %s:%d/%d\n", s, event->comm.comm, event->comm.pid, event->comm.tid);
}
+size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp)
+{
+ size_t ret = 0;
+ struct perf_ns_link_info *ns_link_info;
+ u32 nr_namespaces, idx;
+
+ ns_link_info = event->namespaces.link_info;
+ nr_namespaces = event->namespaces.nr_namespaces;
+
+ ret += fprintf(fp, " %d/%d - nr_namespaces: %u\n\t\t[",
+ event->namespaces.pid,
+ event->namespaces.tid,
+ nr_namespaces);
+
+ for (idx = 0; idx < nr_namespaces; idx++) {
+ if (idx && (idx % 4 == 0))
+ ret += fprintf(fp, "\n\t\t ");
+
+ ret += fprintf(fp, "%u/%s: %" PRIu64 "/%#" PRIx64 "%s", idx,
+ perf_ns__name(idx), (u64)ns_link_info[idx].dev,
+ (u64)ns_link_info[idx].ino,
+ ((idx + 1) != nr_namespaces) ? ", " : "]\n");
+ }
+
+ return ret;
+}
+
int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
@@ -1016,6 +1143,14 @@ int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
return machine__process_comm_event(machine, event, sample);
}
+int perf_event__process_namespaces(struct perf_tool *tool __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ return machine__process_namespaces_event(machine, event, sample);
+}
+
int perf_event__process_lost(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct perf_sample *sample,
@@ -1153,11 +1288,12 @@ int perf_event__process_exit(struct perf_tool *tool __maybe_unused,
size_t perf_event__fprintf_aux(union perf_event *event, FILE *fp)
{
- return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s]\n",
+ return fprintf(fp, " offset: %#"PRIx64" size: %#"PRIx64" flags: %#"PRIx64" [%s%s%s]\n",
event->aux.aux_offset, event->aux.aux_size,
event->aux.flags,
event->aux.flags & PERF_AUX_FLAG_TRUNCATED ? "T" : "",
- event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "");
+ event->aux.flags & PERF_AUX_FLAG_OVERWRITE ? "O" : "",
+ event->aux.flags & PERF_AUX_FLAG_PARTIAL ? "P" : "");
}
size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp)
@@ -1196,6 +1332,9 @@ size_t perf_event__fprintf(union perf_event *event, FILE *fp)
case PERF_RECORD_MMAP:
ret += perf_event__fprintf_mmap(event, fp);
break;
+ case PERF_RECORD_NAMESPACES:
+ ret += perf_event__fprintf_namespaces(event, fp);
+ break;
case PERF_RECORD_MMAP2:
ret += perf_event__fprintf_mmap2(event, fp);
break;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index c735c53a26f8..eb7a7b200737 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -39,6 +39,13 @@ struct comm_event {
char comm[16];
};
+struct namespaces_event {
+ struct perf_event_header header;
+ u32 pid, tid;
+ u64 nr_namespaces;
+ struct perf_ns_link_info link_info[];
+};
+
struct fork_event {
struct perf_event_header header;
u32 pid, ppid;
@@ -269,6 +276,7 @@ struct events_stats {
u64 total_lost;
u64 total_lost_samples;
u64 total_aux_lost;
+ u64 total_aux_partial;
u64 total_invalid_chains;
u32 nr_events[PERF_RECORD_HEADER_MAX];
u32 nr_non_filtered_samples;
@@ -485,6 +493,7 @@ union perf_event {
struct mmap_event mmap;
struct mmap2_event mmap2;
struct comm_event comm;
+ struct namespaces_event namespaces;
struct fork_event fork;
struct lost_event lost;
struct lost_samples_event lost_samples;
@@ -587,6 +596,10 @@ int perf_event__process_switch(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
struct machine *machine);
+int perf_event__process_namespaces(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine);
int perf_event__process_mmap(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample,
@@ -636,6 +649,12 @@ pid_t perf_event__synthesize_comm(struct perf_tool *tool,
perf_event__handler_t process,
struct machine *machine);
+int perf_event__synthesize_namespaces(struct perf_tool *tool,
+ union perf_event *event,
+ pid_t pid, pid_t tgid,
+ perf_event__handler_t process,
+ struct machine *machine);
+
int perf_event__synthesize_mmap_events(struct perf_tool *tool,
union perf_event *event,
pid_t pid, pid_t tgid,
@@ -653,6 +672,7 @@ size_t perf_event__fprintf_itrace_start(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_switch(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_thread_map(union perf_event *event, FILE *fp);
size_t perf_event__fprintf_cpu_map(union perf_event *event, FILE *fp);
+size_t perf_event__fprintf_namespaces(union perf_event *event, FILE *fp);
size_t perf_event__fprintf(union perf_event *event, FILE *fp);
u64 kallsyms__get_function_start(const char *kallsyms_filename,
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index b601f2814a30..50420cd35446 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -777,7 +777,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *md, bool check_messu
/*
* Check if event was unmapped due to a POLLHUP/POLLERR.
*/
- if (!atomic_read(&md->refcnt))
+ if (!refcount_read(&md->refcnt))
return NULL;
head = perf_mmap__read_head(md);
@@ -794,7 +794,7 @@ perf_mmap__read_backward(struct perf_mmap *md)
/*
* Check if event was unmapped due to a POLLHUP/POLLERR.
*/
- if (!atomic_read(&md->refcnt))
+ if (!refcount_read(&md->refcnt))
return NULL;
head = perf_mmap__read_head(md);
@@ -856,7 +856,7 @@ void perf_mmap__read_catchup(struct perf_mmap *md)
{
u64 head;
- if (!atomic_read(&md->refcnt))
+ if (!refcount_read(&md->refcnt))
return;
head = perf_mmap__read_head(md);
@@ -875,14 +875,14 @@ static bool perf_mmap__empty(struct perf_mmap *md)
static void perf_mmap__get(struct perf_mmap *map)
{
- atomic_inc(&map->refcnt);
+ refcount_inc(&map->refcnt);
}
static void perf_mmap__put(struct perf_mmap *md)
{
- BUG_ON(md->base && atomic_read(&md->refcnt) == 0);
+ BUG_ON(md->base && refcount_read(&md->refcnt) == 0);
- if (atomic_dec_and_test(&md->refcnt))
+ if (refcount_dec_and_test(&md->refcnt))
perf_mmap__munmap(md);
}
@@ -894,7 +894,7 @@ void perf_mmap__consume(struct perf_mmap *md, bool overwrite)
perf_mmap__write_tail(md, old);
}
- if (atomic_read(&md->refcnt) == 1 && perf_mmap__empty(md))
+ if (refcount_read(&md->refcnt) == 1 && perf_mmap__empty(md))
perf_mmap__put(md);
}
@@ -937,7 +937,7 @@ static void perf_mmap__munmap(struct perf_mmap *map)
munmap(map->base, perf_mmap__mmap_len(map));
map->base = NULL;
map->fd = -1;
- atomic_set(&map->refcnt, 0);
+ refcount_set(&map->refcnt, 0);
}
auxtrace_mmap__munmap(&map->auxtrace_mmap);
}
@@ -974,8 +974,19 @@ static struct perf_mmap *perf_evlist__alloc_mmap(struct perf_evlist *evlist)
if (!map)
return NULL;
- for (i = 0; i < evlist->nr_mmaps; i++)
+ for (i = 0; i < evlist->nr_mmaps; i++) {
map[i].fd = -1;
+ /*
+ * When the perf_mmap() call is made we grab one refcount, plus
+ * one extra to let perf_evlist__mmap_consume() get the last
+ * events after all real references (perf_mmap__get()) are
+ * dropped.
+ *
+ * Each PERF_EVENT_IOC_SET_OUTPUT points to this mmap and
+ * thus does perf_mmap__get() on it.
+ */
+ refcount_set(&map[i].refcnt, 0);
+ }
return map;
}
@@ -1001,7 +1012,7 @@ static int perf_mmap__mmap(struct perf_mmap *map,
* evlist layer can't just drop it when filtering events in
* perf_evlist__filter_pollfd().
*/
- atomic_set(&map->refcnt, 2);
+ refcount_set(&map->refcnt, 2);
map->prev = 0;
map->mask = mp->mask;
map->base = mmap(NULL, perf_mmap__mmap_len(map), mp->prot,
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 389b9ccdf8c7..39942995f537 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -1,7 +1,7 @@
#ifndef __PERF_EVLIST_H
#define __PERF_EVLIST_H 1
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/list.h>
#include <api/fd/array.h>
#include <stdio.h>
@@ -29,7 +29,7 @@ struct perf_mmap {
void *base;
int mask;
int fd;
- atomic_t refcnt;
+ refcount_t refcnt;
u64 prev;
struct auxtrace_mmap auxtrace_mmap;
char event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index ac59710b79e0..9dc7e2d6e48a 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -236,6 +236,10 @@ void perf_evsel__init(struct perf_evsel *evsel,
evsel->sample_size = __perf_evsel__sample_size(attr->sample_type);
perf_evsel__calc_id_pos(evsel);
evsel->cmdline_group_boundary = false;
+ evsel->metric_expr = NULL;
+ evsel->metric_name = NULL;
+ evsel->metric_events = NULL;
+ evsel->collect_stat = false;
}
struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx)
@@ -932,6 +936,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
attr->mmap2 = track && !perf_missing_features.mmap2;
attr->comm = track;
+ if (opts->record_namespaces)
+ attr->namespaces = track;
+
if (opts->record_switch_events)
attr->context_switch = track;
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 06ef6f29efa1..d101695c482c 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -131,6 +131,11 @@ struct perf_evsel {
bool cmdline_group_boundary;
struct list_head config_terms;
int bpf_fd;
+ bool merged_stat;
+ const char * metric_expr;
+ const char * metric_name;
+ struct perf_evsel **metric_events;
+ bool collect_stat;
};
union u64_swap {
diff --git a/tools/perf/util/expr.h b/tools/perf/util/expr.h
new file mode 100644
index 000000000000..9c2760a1a96e
--- /dev/null
+++ b/tools/perf/util/expr.h
@@ -0,0 +1,25 @@
+#ifndef PARSE_CTX_H
+#define PARSE_CTX_H 1
+
+#define EXPR_MAX_OTHER 8
+#define MAX_PARSE_ID EXPR_MAX_OTHER
+
+struct parse_id {
+ const char *name;
+ double val;
+};
+
+struct parse_ctx {
+ int num_ids;
+ struct parse_id ids[MAX_PARSE_ID];
+};
+
+void expr__ctx_init(struct parse_ctx *ctx);
+void expr__add_id(struct parse_ctx *ctx, const char *id, double val);
+#ifndef IN_EXPR_Y
+int expr__parse(double *final_val, struct parse_ctx *ctx, const char **pp);
+#endif
+int expr__find_other(const char *p, const char *one, const char ***other,
+ int *num_other);
+
+#endif
diff --git a/tools/perf/util/expr.y b/tools/perf/util/expr.y
new file mode 100644
index 000000000000..954556bea36e
--- /dev/null
+++ b/tools/perf/util/expr.y
@@ -0,0 +1,173 @@
+/* Simple expression parser */
+%{
+#include "util.h"
+#include "util/debug.h"
+#define IN_EXPR_Y 1
+#include "expr.h"
+#include <string.h>
+
+#define MAXIDLEN 256
+%}
+
+%pure-parser
+%parse-param { double *final_val }
+%parse-param { struct parse_ctx *ctx }
+%parse-param { const char **pp }
+%lex-param { const char **pp }
+
+%union {
+ double num;
+ char id[MAXIDLEN+1];
+}
+
+%token <num> NUMBER
+%token <id> ID
+%left '|'
+%left '^'
+%left '&'
+%left '-' '+'
+%left '*' '/' '%'
+%left NEG NOT
+%type <num> expr
+
+%{
+static int expr__lex(YYSTYPE *res, const char **pp);
+
+static void expr__error(double *final_val __maybe_unused,
+ struct parse_ctx *ctx __maybe_unused,
+ const char **pp __maybe_unused,
+ const char *s)
+{
+ pr_debug("%s\n", s);
+}
+
+static int lookup_id(struct parse_ctx *ctx, char *id, double *val)
+{
+ int i;
+
+ for (i = 0; i < ctx->num_ids; i++) {
+ if (!strcasecmp(ctx->ids[i].name, id)) {
+ *val = ctx->ids[i].val;
+ return 0;
+ }
+ }
+ return -1;
+}
+
+%}
+%%
+
+all_expr: expr { *final_val = $1; }
+ ;
+
+expr: NUMBER
+ | ID { if (lookup_id(ctx, $1, &$$) < 0) {
+ pr_debug("%s not found", $1);
+ YYABORT;
+ }
+ }
+ | expr '+' expr { $$ = $1 + $3; }
+ | expr '-' expr { $$ = $1 - $3; }
+ | expr '*' expr { $$ = $1 * $3; }
+ | expr '/' expr { if ($3 == 0) YYABORT; $$ = $1 / $3; }
+ | expr '%' expr { if ((long)$3 == 0) YYABORT; $$ = (long)$1 % (long)$3; }
+ | '-' expr %prec NEG { $$ = -$2; }
+ | '(' expr ')' { $$ = $2; }
+ ;
+
+%%
+
+static int expr__symbol(YYSTYPE *res, const char *p, const char **pp)
+{
+ char *dst = res->id;
+ const char *s = p;
+
+ while (isalnum(*p) || *p == '_' || *p == '.') {
+ if (p - s >= MAXIDLEN)
+ return -1;
+ *dst++ = *p++;
+ }
+ *dst = 0;
+ *pp = p;
+ return ID;
+}
+
+static int expr__lex(YYSTYPE *res, const char **pp)
+{
+ int tok;
+ const char *s;
+ const char *p = *pp;
+
+ while (isspace(*p))
+ p++;
+ s = p;
+ switch (*p++) {
+ case 'a' ... 'z':
+ case 'A' ... 'Z':
+ return expr__symbol(res, p - 1, pp);
+ case '0' ... '9': case '.':
+ res->num = strtod(s, (char **)&p);
+ tok = NUMBER;
+ break;
+ default:
+ tok = *s;
+ break;
+ }
+ *pp = p;
+ return tok;
+}
+
+/* Caller must make sure id is allocated */
+void expr__add_id(struct parse_ctx *ctx, const char *name, double val)
+{
+ int idx;
+ assert(ctx->num_ids < MAX_PARSE_ID);
+ idx = ctx->num_ids++;
+ ctx->ids[idx].name = name;
+ ctx->ids[idx].val = val;
+}
+
+void expr__ctx_init(struct parse_ctx *ctx)
+{
+ ctx->num_ids = 0;
+}
+
+int expr__find_other(const char *p, const char *one, const char ***other,
+ int *num_otherp)
+{
+ const char *orig = p;
+ int err = -1;
+ int num_other;
+
+ *other = malloc((EXPR_MAX_OTHER + 1) * sizeof(char *));
+ if (!*other)
+ return -1;
+
+ num_other = 0;
+ for (;;) {
+ YYSTYPE val;
+ int tok = expr__lex(&val, &p);
+ if (tok == 0) {
+ err = 0;
+ break;
+ }
+ if (tok == ID && strcasecmp(one, val.id)) {
+ if (num_other >= EXPR_MAX_OTHER - 1) {
+ pr_debug("Too many extra events in %s\n", orig);
+ break;
+ }
+ (*other)[num_other] = strdup(val.id);
+ if (!(*other)[num_other])
+ return -1;
+ num_other++;
+ }
+ }
+ (*other)[num_other] = NULL;
+ *num_otherp = num_other;
+ if (err) {
+ *num_otherp = 0;
+ free(*other);
+ *other = NULL;
+ }
+ return err;
+}
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 05714d548584..ef09f26e67da 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -370,15 +370,11 @@ static int write_cmdline(int fd, struct perf_header *h __maybe_unused,
struct perf_evlist *evlist __maybe_unused)
{
char buf[MAXPATHLEN];
- char proc[32];
u32 n;
int i, ret;
- /*
- * actual atual path to perf binary
- */
- sprintf(proc, "/proc/%d/exe", getpid());
- ret = readlink(proc, buf, sizeof(buf));
+ /* actual path to perf binary */
+ ret = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
if (ret <= 0)
return -1;
diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c
index 2821f8d77e52..34201440ac03 100644
--- a/tools/perf/util/help-unknown-cmd.c
+++ b/tools/perf/util/help-unknown-cmd.c
@@ -6,16 +6,12 @@
#include "levenshtein.h"
static int autocorrect;
-static struct cmdnames aliases;
static int perf_unknown_cmd_config(const char *var, const char *value,
void *cb __maybe_unused)
{
if (!strcmp(var, "help.autocorrect"))
autocorrect = perf_config_int(var,value);
- /* Also use aliases for command lookup */
- if (!prefixcmp(var, "alias."))
- add_cmdname(&aliases, var + 6, strlen(var + 6));
return 0;
}
@@ -59,14 +55,12 @@ const char *help_unknown_cmd(const char *cmd)
memset(&main_cmds, 0, sizeof(main_cmds));
memset(&other_cmds, 0, sizeof(main_cmds));
- memset(&aliases, 0, sizeof(aliases));
perf_config(perf_unknown_cmd_config, NULL);
load_command_list("perf-", &main_cmds, &other_cmds);
- if (add_cmd_list(&main_cmds, &aliases) < 0 ||
- add_cmd_list(&main_cmds, &other_cmds) < 0) {
+ if (add_cmd_list(&main_cmds, &other_cmds) < 0) {
fprintf(stderr, "ERROR: Failed to allocate command list for unknown command.\n");
goto end;
}
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index eaf72a938fb4..61bf304206fd 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -3,6 +3,7 @@
#include "hist.h"
#include "map.h"
#include "session.h"
+#include "namespaces.h"
#include "sort.h"
#include "evlist.h"
#include "evsel.h"
@@ -169,6 +170,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO);
}
+ hists__new_col_len(hists, HISTC_CGROUP_ID, 20);
hists__new_col_len(hists, HISTC_CPU, 3);
hists__new_col_len(hists, HISTC_SOCKET, 6);
hists__new_col_len(hists, HISTC_MEM_LOCKED, 6);
@@ -574,9 +576,14 @@ __hists__add_entry(struct hists *hists,
bool sample_self,
struct hist_entry_ops *ops)
{
+ struct namespaces *ns = thread__namespaces(al->thread);
struct hist_entry entry = {
.thread = al->thread,
.comm = thread__comm(al->thread),
+ .cgroup_id = {
+ .dev = ns ? ns->link_info[CGROUP_NS_INDEX].dev : 0,
+ .ino = ns ? ns->link_info[CGROUP_NS_INDEX].ino : 0,
+ },
.ms = {
.map = al->map,
.sym = al->sym,
@@ -1129,6 +1136,11 @@ void hist_entry__delete(struct hist_entry *he)
zfree(&he->mem_info);
}
+ if (he->inline_node) {
+ inline_node__delete(he->inline_node);
+ he->inline_node = NULL;
+ }
+
zfree(&he->stat_acc);
free_srcline(he->srcline);
if (he->srcfile && he->srcfile[0])
@@ -2447,7 +2459,7 @@ int parse_filter_percentage(const struct option *opt __maybe_unused,
else if (!strcmp(arg, "absolute"))
symbol_conf.filter_relative = false;
else {
- pr_debug("Invalud percentage: %s\n", arg);
+ pr_debug("Invalid percentage: %s\n", arg);
return -1;
}
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index 28c216e3d5b7..ee3670a388df 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -30,6 +30,7 @@ enum hist_column {
HISTC_DSO,
HISTC_THREAD,
HISTC_COMM,
+ HISTC_CGROUP_ID,
HISTC_PARENT,
HISTC_CPU,
HISTC_SOCKET,
@@ -57,6 +58,7 @@ enum hist_column {
HISTC_SRCLINE_FROM,
HISTC_SRCLINE_TO,
HISTC_TRACE,
+ HISTC_SYM_SIZE,
HISTC_NR_COLS, /* Last entry */
};
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
index 4f3c758d875d..54818828023b 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-insn-decoder.c
@@ -26,6 +26,7 @@
#include "insn.c"
#include "intel-pt-insn-decoder.h"
+#include "dump-insn.h"
#if INTEL_PT_INSN_BUF_SZ < MAX_INSN_SIZE || INTEL_PT_INSN_BUF_SZ > MAX_INSN
#error Instruction buffer size too small
@@ -39,6 +40,8 @@ static void intel_pt_insn_decoder(struct insn *insn,
enum intel_pt_insn_branch branch = INTEL_PT_BR_NO_BRANCH;
int ext;
+ intel_pt_insn->rel = 0;
+
if (insn_is_avx(insn)) {
intel_pt_insn->op = INTEL_PT_OP_OTHER;
intel_pt_insn->branch = INTEL_PT_BR_NO_BRANCH;
@@ -177,6 +180,29 @@ int intel_pt_get_insn(const unsigned char *buf, size_t len, int x86_64,
return 0;
}
+const char *dump_insn(struct perf_insn *x, uint64_t ip __maybe_unused,
+ u8 *inbuf, int inlen, int *lenp)
+{
+ struct insn insn;
+ int n, i;
+ int left;
+
+ insn_init(&insn, inbuf, inlen, x->is64bit);
+ insn_get_length(&insn);
+ if (!insn_complete(&insn) || insn.length > inlen)
+ return "<bad>";
+ if (lenp)
+ *lenp = insn.length;
+ left = sizeof(x->out);
+ n = snprintf(x->out, left, "insn: ");
+ left -= n;
+ for (i = 0; i < insn.length; i++) {
+ n += snprintf(x->out + n, left, "%02x ", inbuf[i]);
+ left -= n;
+ }
+ return x->out;
+}
+
const char *branch_name[] = {
[INTEL_PT_OP_OTHER] = "Other",
[INTEL_PT_OP_CALL] = "Call",
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 71c9720d4973..dfc600446586 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -13,6 +13,7 @@
#include <symbol/kallsyms.h>
#include "unwind.h"
#include "linux/hash.h"
+#include "asm/bug.h"
static void __machine__remove_thread(struct machine *machine, struct thread *th, bool lock);
@@ -501,6 +502,37 @@ int machine__process_comm_event(struct machine *machine, union perf_event *event
return err;
}
+int machine__process_namespaces_event(struct machine *machine __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused)
+{
+ struct thread *thread = machine__findnew_thread(machine,
+ event->namespaces.pid,
+ event->namespaces.tid);
+ int err = 0;
+
+ WARN_ONCE(event->namespaces.nr_namespaces > NR_NAMESPACES,
+ "\nWARNING: kernel seems to support more namespaces than perf"
+ " tool.\nTry updating the perf tool..\n\n");
+
+ WARN_ONCE(event->namespaces.nr_namespaces < NR_NAMESPACES,
+ "\nWARNING: perf tool seems to support more namespaces than"
+ " the kernel.\nTry updating the kernel..\n\n");
+
+ if (dump_trace)
+ perf_event__fprintf_namespaces(event, stdout);
+
+ if (thread == NULL ||
+ thread__set_namespaces(thread, sample->time, &event->namespaces)) {
+ dump_printf("problem processing PERF_RECORD_NAMESPACES, skipping event.\n");
+ err = -1;
+ }
+
+ thread__put(thread);
+
+ return err;
+}
+
int machine__process_lost_event(struct machine *machine __maybe_unused,
union perf_event *event, struct perf_sample *sample __maybe_unused)
{
@@ -1439,7 +1471,7 @@ static void __machine__remove_thread(struct machine *machine, struct thread *th,
if (machine->last_match == th)
machine->last_match = NULL;
- BUG_ON(atomic_read(&th->refcnt) == 0);
+ BUG_ON(refcount_read(&th->refcnt) == 0);
if (lock)
pthread_rwlock_wrlock(&machine->threads_lock);
rb_erase_init(&th->rb_node, &machine->threads);
@@ -1538,6 +1570,8 @@ int machine__process_event(struct machine *machine, union perf_event *event,
ret = machine__process_comm_event(machine, event, sample); break;
case PERF_RECORD_MMAP:
ret = machine__process_mmap_event(machine, event, sample); break;
+ case PERF_RECORD_NAMESPACES:
+ ret = machine__process_namespaces_event(machine, event, sample); break;
case PERF_RECORD_MMAP2:
ret = machine__process_mmap2_event(machine, event, sample); break;
case PERF_RECORD_FORK:
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index a28305029711..3cdb1340f917 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -97,6 +97,9 @@ int machine__process_itrace_start_event(struct machine *machine,
union perf_event *event);
int machine__process_switch_event(struct machine *machine,
union perf_event *event);
+int machine__process_namespaces_event(struct machine *machine,
+ union perf_event *event,
+ struct perf_sample *sample);
int machine__process_mmap_event(struct machine *machine, union perf_event *event,
struct perf_sample *sample);
int machine__process_mmap2_event(struct machine *machine, union perf_event *event,
diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 0a943e7b1ea7..c1870ac365a3 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -141,7 +141,7 @@ void map__init(struct map *map, enum map_type type,
RB_CLEAR_NODE(&map->rb_node);
map->groups = NULL;
map->erange_warned = false;
- atomic_set(&map->refcnt, 1);
+ refcount_set(&map->refcnt, 1);
}
struct map *map__new(struct machine *machine, u64 start, u64 len,
@@ -255,7 +255,7 @@ void map__delete(struct map *map)
void map__put(struct map *map)
{
- if (map && atomic_dec_and_test(&map->refcnt))
+ if (map && refcount_dec_and_test(&map->refcnt))
map__delete(map);
}
@@ -354,7 +354,7 @@ struct map *map__clone(struct map *from)
struct map *map = memdup(from, sizeof(*map));
if (map != NULL) {
- atomic_set(&map->refcnt, 1);
+ refcount_set(&map->refcnt, 1);
RB_CLEAR_NODE(&map->rb_node);
dso__get(map->dso);
map->groups = NULL;
@@ -405,7 +405,8 @@ int map__fprintf_srcline(struct map *map, u64 addr, const char *prefix,
if (map && map->dso) {
srcline = get_srcline(map->dso,
- map__rip_2objdump(map, addr), NULL, true);
+ map__rip_2objdump(map, addr), NULL,
+ true, true);
if (srcline != SRCLINE_UNKNOWN)
ret = fprintf(fp, "%s%s", prefix, srcline);
free_srcline(srcline);
@@ -485,7 +486,7 @@ void map_groups__init(struct map_groups *mg, struct machine *machine)
maps__init(&mg->maps[i]);
}
mg->machine = machine;
- atomic_set(&mg->refcnt, 1);
+ refcount_set(&mg->refcnt, 1);
}
static void __maps__purge(struct maps *maps)
@@ -547,7 +548,7 @@ void map_groups__delete(struct map_groups *mg)
void map_groups__put(struct map_groups *mg)
{
- if (mg && atomic_dec_and_test(&mg->refcnt))
+ if (mg && refcount_dec_and_test(&mg->refcnt))
map_groups__delete(mg);
}
diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h
index abdacf800c98..c8a5a644c0a9 100644
--- a/tools/perf/util/map.h
+++ b/tools/perf/util/map.h
@@ -1,7 +1,7 @@
#ifndef __PERF_MAP_H
#define __PERF_MAP_H
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/rbtree.h>
@@ -51,7 +51,7 @@ struct map {
struct dso *dso;
struct map_groups *groups;
- atomic_t refcnt;
+ refcount_t refcnt;
};
struct kmap {
@@ -67,7 +67,7 @@ struct maps {
struct map_groups {
struct maps maps[MAP__NR_TYPES];
struct machine *machine;
- atomic_t refcnt;
+ refcount_t refcnt;
};
struct map_groups *map_groups__new(struct machine *machine);
@@ -77,7 +77,7 @@ bool map_groups__empty(struct map_groups *mg);
static inline struct map_groups *map_groups__get(struct map_groups *mg)
{
if (mg)
- atomic_inc(&mg->refcnt);
+ refcount_inc(&mg->refcnt);
return mg;
}
@@ -150,7 +150,7 @@ struct map *map__clone(struct map *map);
static inline struct map *map__get(struct map *map)
{
if (map)
- atomic_inc(&map->refcnt);
+ refcount_inc(&map->refcnt);
return map;
}
diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c
new file mode 100644
index 000000000000..2de8da64d90c
--- /dev/null
+++ b/tools/perf/util/namespaces.c
@@ -0,0 +1,36 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2017 Hari Bathini, IBM Corporation
+ */
+
+#include "namespaces.h"
+#include "util.h"
+#include "event.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+struct namespaces *namespaces__new(struct namespaces_event *event)
+{
+ struct namespaces *namespaces;
+ u64 link_info_size = ((event ? event->nr_namespaces : NR_NAMESPACES) *
+ sizeof(struct perf_ns_link_info));
+
+ namespaces = zalloc(sizeof(struct namespaces) + link_info_size);
+ if (!namespaces)
+ return NULL;
+
+ namespaces->end_time = -1;
+
+ if (event)
+ memcpy(namespaces->link_info, event->link_info, link_info_size);
+
+ return namespaces;
+}
+
+void namespaces__free(struct namespaces *namespaces)
+{
+ free(namespaces);
+}
diff --git a/tools/perf/util/namespaces.h b/tools/perf/util/namespaces.h
new file mode 100644
index 000000000000..468f1e9a1484
--- /dev/null
+++ b/tools/perf/util/namespaces.h
@@ -0,0 +1,26 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright (C) 2017 Hari Bathini, IBM Corporation
+ */
+
+#ifndef __PERF_NAMESPACES_H
+#define __PERF_NAMESPACES_H
+
+#include "../perf.h"
+#include <linux/list.h>
+
+struct namespaces_event;
+
+struct namespaces {
+ struct list_head list;
+ u64 end_time;
+ struct perf_ns_link_info link_info[];
+};
+
+struct namespaces *namespaces__new(struct namespaces_event *event);
+void namespaces__free(struct namespaces *namespaces);
+
+#endif /* __PERF_NAMESPACES_H */
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 67a8aebc67ab..6b498aea9fde 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -316,8 +316,9 @@ __add_event(struct list_head *list, int *idx,
return NULL;
(*idx)++;
- evsel->cpus = cpu_map__get(cpus);
- evsel->own_cpus = cpu_map__get(cpus);
+ evsel->cpus = cpu_map__get(cpus);
+ evsel->own_cpus = cpu_map__get(cpus);
+ evsel->system_wide = !!cpus;
if (name)
evsel->name = strdup(name);
@@ -1254,11 +1255,59 @@ int parse_events_add_pmu(struct parse_events_evlist *data,
evsel->scale = info.scale;
evsel->per_pkg = info.per_pkg;
evsel->snapshot = info.snapshot;
+ evsel->metric_expr = info.metric_expr;
+ evsel->metric_name = info.metric_name;
}
return evsel ? 0 : -ENOMEM;
}
+int parse_events_multi_pmu_add(struct parse_events_evlist *data,
+ char *str, struct list_head **listp)
+{
+ struct list_head *head;
+ struct parse_events_term *term;
+ struct list_head *list;
+ struct perf_pmu *pmu = NULL;
+ int ok = 0;
+
+ *listp = NULL;
+ /* Add it for all PMUs that support the alias */
+ list = malloc(sizeof(struct list_head));
+ if (!list)
+ return -1;
+ INIT_LIST_HEAD(list);
+ while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+ struct perf_pmu_alias *alias;
+
+ list_for_each_entry(alias, &pmu->aliases, list) {
+ if (!strcasecmp(alias->name, str)) {
+ head = malloc(sizeof(struct list_head));
+ if (!head)
+ return -1;
+ INIT_LIST_HEAD(head);
+ if (parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
+ str, 1, false, &str, NULL) < 0)
+ return -1;
+ list_add_tail(&term->list, head);
+
+ if (!parse_events_add_pmu(data, list,
+ pmu->name, head)) {
+ pr_debug("%s -> %s/%s/\n", str,
+ pmu->name, alias->str);
+ ok++;
+ }
+
+ parse_events_terms__delete(head);
+ }
+ }
+ }
+ if (!ok)
+ return -1;
+ *listp = list;
+ return 0;
+}
+
int parse_events__modifier_group(struct list_head *list,
char *event_mod)
{
@@ -2276,7 +2325,7 @@ out_enomem:
* Print the help text for the event symbols:
*/
void print_events(const char *event_glob, bool name_only, bool quiet_flag,
- bool long_desc)
+ bool long_desc, bool details_flag)
{
print_symbol_events(event_glob, PERF_TYPE_HARDWARE,
event_symbols_hw, PERF_COUNT_HW_MAX, name_only);
@@ -2286,7 +2335,8 @@ void print_events(const char *event_glob, bool name_only, bool quiet_flag,
print_hwcache_events(event_glob, name_only);
- print_pmu_events(event_glob, name_only, quiet_flag, long_desc);
+ print_pmu_events(event_glob, name_only, quiet_flag, long_desc,
+ details_flag);
if (event_glob != NULL)
return;
@@ -2415,6 +2465,31 @@ int parse_events_term__clone(struct parse_events_term **new,
return new_term(new, &temp, term->val.str, term->val.num);
}
+int parse_events_copy_term_list(struct list_head *old,
+ struct list_head **new)
+{
+ struct parse_events_term *term, *n;
+ int ret;
+
+ if (!old) {
+ *new = NULL;
+ return 0;
+ }
+
+ *new = malloc(sizeof(struct list_head));
+ if (!*new)
+ return -ENOMEM;
+ INIT_LIST_HEAD(*new);
+
+ list_for_each_entry (term, old, list) {
+ ret = parse_events_term__clone(&n, term);
+ if (ret)
+ return ret;
+ list_add_tail(&n->list, *new);
+ }
+ return 0;
+}
+
void parse_events_terms__purge(struct list_head *terms)
{
struct parse_events_term *term, *h;
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 1af6a267c21b..a235f4d6d5e5 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -8,6 +8,7 @@
#include <stdbool.h>
#include <linux/types.h>
#include <linux/perf_event.h>
+#include <string.h>
struct list_head;
struct perf_evsel;
@@ -166,6 +167,14 @@ int parse_events_add_breakpoint(struct list_head *list, int *idx,
int parse_events_add_pmu(struct parse_events_evlist *data,
struct list_head *list, char *name,
struct list_head *head_config);
+
+int parse_events_multi_pmu_add(struct parse_events_evlist *data,
+ char *str,
+ struct list_head **listp);
+
+int parse_events_copy_term_list(struct list_head *old,
+ struct list_head **new);
+
enum perf_pmu_event_symbol_type
perf_pmu__parse_check(const char *name);
void parse_events__set_leader(char *name, struct list_head *list);
@@ -175,7 +184,7 @@ void parse_events_evlist_error(struct parse_events_evlist *data,
int idx, const char *str);
void print_events(const char *event_glob, bool name_only, bool quiet,
- bool long_desc);
+ bool long_desc, bool details_flag);
struct event_symbol {
const char *symbol;
@@ -196,4 +205,23 @@ int is_valid_tracepoint(const char *event_string);
int valid_event_mount(const char *eventfs);
char *parse_events_formats_error_string(char *additional_terms);
+#ifdef HAVE_LIBELF_SUPPORT
+/*
+ * If the probe point starts with '%',
+ * or starts with "sdt_" and has a ':' but no '=',
+ * then it should be a SDT/cached probe point.
+ */
+static inline bool is_sdt_event(char *str)
+{
+ return (str[0] == '%' ||
+ (!strncmp(str, "sdt_", 4) &&
+ !!strchr(str, ':') && !strchr(str, '=')));
+}
+#else
+static inline bool is_sdt_event(char *str __maybe_unused)
+{
+ return false;
+}
+#endif /* HAVE_LIBELF_SUPPORT */
+
#endif /* __PERF_PARSE_EVENTS_H */
diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y
index 30f018ea1370..04fd8c9af9f9 100644
--- a/tools/perf/util/parse-events.y
+++ b/tools/perf/util/parse-events.y
@@ -226,68 +226,55 @@ event_pmu:
PE_NAME opt_event_config
{
struct parse_events_evlist *data = _data;
- struct list_head *list;
+ struct list_head *list, *orig_terms, *terms;
+
+ if (parse_events_copy_term_list($2, &orig_terms))
+ YYABORT;
ALLOC_LIST(list);
- ABORT_ON(parse_events_add_pmu(data, list, $1, $2));
+ if (parse_events_add_pmu(data, list, $1, $2)) {
+ struct perf_pmu *pmu = NULL;
+ int ok = 0;
+
+ while ((pmu = perf_pmu__scan(pmu)) != NULL) {
+ char *name = pmu->name;
+
+ if (!strncmp(name, "uncore_", 7) &&
+ strncmp($1, "uncore_", 7))
+ name += 7;
+ if (!strncmp($1, name, strlen($1))) {
+ if (parse_events_copy_term_list(orig_terms, &terms))
+ YYABORT;
+ if (!parse_events_add_pmu(data, list, pmu->name, terms))
+ ok++;
+ parse_events_terms__delete(terms);
+ }
+ }
+ if (!ok)
+ YYABORT;
+ }
parse_events_terms__delete($2);
+ parse_events_terms__delete(orig_terms);
$$ = list;
}
|
PE_KERNEL_PMU_EVENT sep_dc
{
- struct parse_events_evlist *data = _data;
- struct list_head *head;
- struct parse_events_term *term;
struct list_head *list;
- struct perf_pmu *pmu = NULL;
- int ok = 0;
-
- /* Add it for all PMUs that support the alias */
- ALLOC_LIST(list);
- while ((pmu = perf_pmu__scan(pmu)) != NULL) {
- struct perf_pmu_alias *alias;
-
- list_for_each_entry(alias, &pmu->aliases, list) {
- if (!strcasecmp(alias->name, $1)) {
- ALLOC_LIST(head);
- ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- $1, 1, false, &@1, NULL));
- list_add_tail(&term->list, head);
-
- if (!parse_events_add_pmu(data, list,
- pmu->name, head)) {
- pr_debug("%s -> %s/%s/\n", $1,
- pmu->name, alias->str);
- ok++;
- }
- parse_events_terms__delete(head);
- }
- }
- }
- if (!ok)
+ if (parse_events_multi_pmu_add(_data, $1, &list) < 0)
YYABORT;
$$ = list;
}
|
PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc
{
- struct parse_events_evlist *data = _data;
- struct list_head *head;
- struct parse_events_term *term;
struct list_head *list;
char pmu_name[128];
- snprintf(&pmu_name, 128, "%s-%s", $1, $3);
- ALLOC_LIST(head);
- ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER,
- &pmu_name, 1, false, &@1, NULL));
- list_add_tail(&term->list, head);
-
- ALLOC_LIST(list);
- ABORT_ON(parse_events_add_pmu(data, list, "cpu", head));
- parse_events_terms__delete(head);
+ snprintf(&pmu_name, 128, "%s-%s", $1, $3);
+ if (parse_events_multi_pmu_add(_data, pmu_name, &list) < 0)
+ YYABORT;
$$ = list;
}
diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c
index c4023f22f287..b2ae039eff85 100644
--- a/tools/perf/util/perf_regs.c
+++ b/tools/perf/util/perf_regs.c
@@ -6,6 +6,12 @@ const struct sample_reg __weak sample_reg_masks[] = {
SMPL_REG_END
};
+int __weak arch_sdt_arg_parse_op(char *old_op __maybe_unused,
+ char **new_op __maybe_unused)
+{
+ return SDT_ARG_SKIP;
+}
+
#ifdef HAVE_PERF_REGS_SUPPORT
int perf_reg_value(u64 *valp, struct regs_dump *regs, int id)
{
diff --git a/tools/perf/util/perf_regs.h b/tools/perf/util/perf_regs.h
index 679d6e493962..32b37d19dcc3 100644
--- a/tools/perf/util/perf_regs.h
+++ b/tools/perf/util/perf_regs.h
@@ -15,6 +15,13 @@ struct sample_reg {
extern const struct sample_reg sample_reg_masks[];
+enum {
+ SDT_ARG_VALID = 0,
+ SDT_ARG_SKIP,
+};
+
+int arch_sdt_arg_parse_op(char *old_op, char **new_op);
+
#ifdef HAVE_PERF_REGS_SUPPORT
#include <perf_regs.h>
diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c
index 12f84dd2ac5d..362051ea7f3d 100644
--- a/tools/perf/util/pmu.c
+++ b/tools/perf/util/pmu.c
@@ -231,7 +231,9 @@ static int perf_pmu__parse_snapshot(struct perf_pmu_alias *alias,
static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
char *desc, char *val,
char *long_desc, char *topic,
- char *unit, char *perpkg)
+ char *unit, char *perpkg,
+ char *metric_expr,
+ char *metric_name)
{
struct perf_pmu_alias *alias;
int ret;
@@ -265,6 +267,8 @@ static int __perf_pmu__new_alias(struct list_head *list, char *dir, char *name,
perf_pmu__parse_snapshot(alias, dir, name);
}
+ alias->metric_expr = metric_expr ? strdup(metric_expr) : NULL;
+ alias->metric_name = metric_name ? strdup(metric_name): NULL;
alias->desc = desc ? strdup(desc) : NULL;
alias->long_desc = long_desc ? strdup(long_desc) :
desc ? strdup(desc) : NULL;
@@ -294,7 +298,7 @@ static int perf_pmu__new_alias(struct list_head *list, char *dir, char *name, FI
buf[ret] = 0;
return __perf_pmu__new_alias(list, dir, name, NULL, buf, NULL, NULL, NULL,
- NULL);
+ NULL, NULL, NULL);
}
static inline bool pmu_alias_info_file(char *name)
@@ -564,7 +568,9 @@ static void pmu_add_cpu_aliases(struct list_head *head, const char *name)
__perf_pmu__new_alias(head, NULL, (char *)pe->name,
(char *)pe->desc, (char *)pe->event,
(char *)pe->long_desc, (char *)pe->topic,
- (char *)pe->unit, (char *)pe->perpkg);
+ (char *)pe->unit, (char *)pe->perpkg,
+ (char *)pe->metric_expr,
+ (char *)pe->metric_name);
}
out:
@@ -991,6 +997,8 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
info->unit = NULL;
info->scale = 0.0;
info->snapshot = false;
+ info->metric_expr = NULL;
+ info->metric_name = NULL;
list_for_each_entry_safe(term, h, head_terms, list) {
alias = pmu_find_alias(pmu, term);
@@ -1006,6 +1014,8 @@ int perf_pmu__check_alias(struct perf_pmu *pmu, struct list_head *head_terms,
if (alias->per_pkg)
info->per_pkg = true;
+ info->metric_expr = alias->metric_expr;
+ info->metric_name = alias->metric_name;
list_del(&term->list);
free(term);
@@ -1100,6 +1110,8 @@ struct sevent {
char *topic;
char *str;
char *pmu;
+ char *metric_expr;
+ char *metric_name;
};
static int cmp_sevent(const void *a, const void *b)
@@ -1142,7 +1154,7 @@ static void wordwrap(char *s, int start, int max, int corr)
}
void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
- bool long_desc)
+ bool long_desc, bool details_flag)
{
struct perf_pmu *pmu;
struct perf_pmu_alias *alias;
@@ -1198,6 +1210,8 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
aliases[j].topic = alias->topic;
aliases[j].str = alias->str;
aliases[j].pmu = pmu->name;
+ aliases[j].metric_expr = alias->metric_expr;
+ aliases[j].metric_name = alias->metric_name;
j++;
}
if (pmu->selectable &&
@@ -1232,8 +1246,14 @@ void print_pmu_events(const char *event_glob, bool name_only, bool quiet_flag,
printf("%*s", 8, "[");
wordwrap(aliases[j].desc, 8, columns, 0);
printf("]\n");
- if (verbose > 0)
- printf("%*s%s/%s/\n", 8, "", aliases[j].pmu, aliases[j].str);
+ if (details_flag) {
+ printf("%*s%s/%s/ ", 8, "", aliases[j].pmu, aliases[j].str);
+ if (aliases[j].metric_name)
+ printf(" MetricName: %s", aliases[j].metric_name);
+ if (aliases[j].metric_expr)
+ printf(" MetricExpr: %s", aliases[j].metric_expr);
+ putchar('\n');
+ }
} else
printf(" %-50s [Kernel PMU event]\n", aliases[j].name);
printed++;
diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h
index 00852ddc7741..ea7f450dc609 100644
--- a/tools/perf/util/pmu.h
+++ b/tools/perf/util/pmu.h
@@ -31,6 +31,8 @@ struct perf_pmu {
struct perf_pmu_info {
const char *unit;
+ const char *metric_expr;
+ const char *metric_name;
double scale;
bool per_pkg;
bool snapshot;
@@ -50,6 +52,8 @@ struct perf_pmu_alias {
double scale;
bool per_pkg;
bool snapshot;
+ char *metric_expr;
+ char *metric_name;
};
struct perf_pmu *perf_pmu__find(const char *name);
@@ -76,7 +80,7 @@ int perf_pmu__format_parse(char *dir, struct list_head *head);
struct perf_pmu *perf_pmu__scan(struct perf_pmu *pmu);
void print_pmu_events(const char *event_glob, bool name_only, bool quiet,
- bool long_desc);
+ bool long_desc, bool details_flag);
bool pmu_have_event(const char *pname, const char *name);
int perf_pmu__scan_file(struct perf_pmu *pmu, const char *name, const char *fmt,
diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index 28fb62c32678..e4b889444447 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -47,7 +47,6 @@
#include "probe-file.h"
#include "session.h"
-#define MAX_CMDLEN 256
#define PERFPROBE_GROUP "probe"
bool probe_event_dry_run; /* Dry run flag */
@@ -757,7 +756,9 @@ post_process_kernel_probe_trace_events(struct probe_trace_event *tevs,
}
for (i = 0; i < ntevs; i++) {
- if (!tevs[i].point.address || tevs[i].point.retprobe)
+ if (!tevs[i].point.address)
+ continue;
+ if (tevs[i].point.retprobe && !kretprobe_offset_is_supported())
continue;
/* If we found a wrong one, mark it by NULL symbol */
if (kprobe_warn_out_range(tevs[i].point.symbol,
@@ -1339,14 +1340,7 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
if (!arg)
return -EINVAL;
- /*
- * If the probe point starts with '%',
- * or starts with "sdt_" and has a ':' but no '=',
- * then it should be a SDT/cached probe point.
- */
- if (arg[0] == '%' ||
- (!strncmp(arg, "sdt_", 4) &&
- !!strchr(arg, ':') && !strchr(arg, '='))) {
+ if (is_sdt_event(arg)) {
pev->sdt = true;
if (arg[0] == '%')
arg++;
@@ -1528,11 +1522,6 @@ static int parse_perf_probe_point(char *arg, struct perf_probe_event *pev)
return -EINVAL;
}
- if (pp->retprobe && !pp->function) {
- semantic_error("Return probe requires an entry function.\n");
- return -EINVAL;
- }
-
if ((pp->offset || pp->line || pp->lazy_line) && pp->retprobe) {
semantic_error("Offset/Line/Lazy pattern can't be used with "
"return probe.\n");
@@ -2841,7 +2830,8 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev,
}
/* Note that the symbols in the kmodule are not relocated */
- if (!pev->uprobes && !pp->retprobe && !pev->target) {
+ if (!pev->uprobes && !pev->target &&
+ (!pp->retprobe || kretprobe_offset_is_supported())) {
reloc_sym = kernel_get_ref_reloc_sym();
if (!reloc_sym) {
pr_warning("Relocated base symbol is not found!\n");
@@ -3057,7 +3047,7 @@ concat_probe_trace_events(struct probe_trace_event **tevs, int *ntevs,
struct probe_trace_event *new_tevs;
int ret = 0;
- if (ntevs == 0) {
+ if (*ntevs == 0) {
*tevs = *tevs2;
*ntevs = ntevs2;
*tevs2 = NULL;
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index 436b64731f65..88714dec8912 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -27,8 +27,10 @@
#include "probe-event.h"
#include "probe-file.h"
#include "session.h"
+#include "perf_regs.h"
-#define MAX_CMDLEN 256
+/* 4096 - 2 ('\n' + '\0') */
+#define MAX_CMDLEN 4094
static void print_open_warning(int err, bool uprobe)
{
@@ -70,7 +72,7 @@ static void print_both_open_warning(int kerr, int uerr)
}
}
-static int open_probe_events(const char *trace_file, bool readwrite)
+int open_trace_file(const char *trace_file, bool readwrite)
{
char buf[PATH_MAX];
int ret;
@@ -92,12 +94,12 @@ static int open_probe_events(const char *trace_file, bool readwrite)
static int open_kprobe_events(bool readwrite)
{
- return open_probe_events("kprobe_events", readwrite);
+ return open_trace_file("kprobe_events", readwrite);
}
static int open_uprobe_events(bool readwrite)
{
- return open_probe_events("uprobe_events", readwrite);
+ return open_trace_file("uprobe_events", readwrite);
}
int probe_file__open(int flag)
@@ -687,6 +689,110 @@ static unsigned long long sdt_note__get_addr(struct sdt_note *note)
: (unsigned long long)note->addr.a64[0];
}
+static const char * const type_to_suffix[] = {
+ ":s64", "", "", "", ":s32", "", ":s16", ":s8",
+ "", ":u8", ":u16", "", ":u32", "", "", "", ":u64"
+};
+
+/*
+ * Isolate the string number and convert it into a decimal value;
+ * this will be an index to get suffix of the uprobe name (defining
+ * the type)
+ */
+static int sdt_arg_parse_size(char *n_ptr, const char **suffix)
+{
+ long type_idx;
+
+ type_idx = strtol(n_ptr, NULL, 10);
+ if (type_idx < -8 || type_idx > 8) {
+ pr_debug4("Failed to get a valid sdt type\n");
+ return -1;
+ }
+
+ *suffix = type_to_suffix[type_idx + 8];
+ return 0;
+}
+
+static int synthesize_sdt_probe_arg(struct strbuf *buf, int i, const char *arg)
+{
+ char *op, *desc = strdup(arg), *new_op = NULL;
+ const char *suffix = "";
+ int ret = -1;
+
+ if (desc == NULL) {
+ pr_debug4("Allocation error\n");
+ return ret;
+ }
+
+ /*
+ * Argument is in N@OP format. N is size of the argument and OP is
+ * the actual assembly operand. N can be omitted; in that case
+ * argument is just OP(without @).
+ */
+ op = strchr(desc, '@');
+ if (op) {
+ op[0] = '\0';
+ op++;
+
+ if (sdt_arg_parse_size(desc, &suffix))
+ goto error;
+ } else {
+ op = desc;
+ }
+
+ ret = arch_sdt_arg_parse_op(op, &new_op);
+
+ if (ret < 0)
+ goto error;
+
+ if (ret == SDT_ARG_VALID) {
+ ret = strbuf_addf(buf, " arg%d=%s%s", i + 1, new_op, suffix);
+ if (ret < 0)
+ goto error;
+ }
+
+ ret = 0;
+error:
+ free(desc);
+ free(new_op);
+ return ret;
+}
+
+static char *synthesize_sdt_probe_command(struct sdt_note *note,
+ const char *pathname,
+ const char *sdtgrp)
+{
+ struct strbuf buf;
+ char *ret = NULL, **args;
+ int i, args_count;
+
+ if (strbuf_init(&buf, 32) < 0)
+ return NULL;
+
+ if (strbuf_addf(&buf, "p:%s/%s %s:0x%llx",
+ sdtgrp, note->name, pathname,
+ sdt_note__get_addr(note)) < 0)
+ goto error;
+
+ if (!note->args)
+ goto out;
+
+ if (note->args) {
+ args = argv_split(note->args, &args_count);
+
+ for (i = 0; i < args_count; ++i) {
+ if (synthesize_sdt_probe_arg(&buf, i, args[i]) < 0)
+ goto error;
+ }
+ }
+
+out:
+ ret = strbuf_detach(&buf, NULL);
+error:
+ strbuf_release(&buf);
+ return ret;
+}
+
int probe_cache__scan_sdt(struct probe_cache *pcache, const char *pathname)
{
struct probe_cache_entry *entry = NULL;
@@ -723,11 +829,12 @@ int probe_cache__scan_sdt(struct probe_cache *pcache, const char *pathname)
entry->pev.group = strdup(sdtgrp);
list_add_tail(&entry->node, &pcache->entries);
}
- ret = asprintf(&buf, "p:%s/%s %s:0x%llx",
- sdtgrp, note->name, pathname,
- sdt_note__get_addr(note));
- if (ret < 0)
+ buf = synthesize_sdt_probe_command(note, pathname, sdtgrp);
+ if (!buf) {
+ ret = -ENOMEM;
break;
+ }
+
strlist__add(entry->tevlist, buf);
free(buf);
entry = NULL;
@@ -877,59 +984,72 @@ int probe_cache__show_all_caches(struct strfilter *filter)
return 0;
}
+enum ftrace_readme {
+ FTRACE_README_PROBE_TYPE_X = 0,
+ FTRACE_README_KRETPROBE_OFFSET,
+ FTRACE_README_END,
+};
+
static struct {
const char *pattern;
- bool avail;
- bool checked;
-} probe_type_table[] = {
-#define DEFINE_TYPE(idx, pat, def_avail) \
- [idx] = {.pattern = pat, .avail = (def_avail)}
- DEFINE_TYPE(PROBE_TYPE_U, "* u8/16/32/64,*", true),
- DEFINE_TYPE(PROBE_TYPE_S, "* s8/16/32/64,*", true),
- DEFINE_TYPE(PROBE_TYPE_X, "* x8/16/32/64,*", false),
- DEFINE_TYPE(PROBE_TYPE_STRING, "* string,*", true),
- DEFINE_TYPE(PROBE_TYPE_BITFIELD,
- "* b<bit-width>@<bit-offset>/<container-size>", true),
+ bool avail;
+} ftrace_readme_table[] = {
+#define DEFINE_TYPE(idx, pat) \
+ [idx] = {.pattern = pat, .avail = false}
+ DEFINE_TYPE(FTRACE_README_PROBE_TYPE_X, "*type: * x8/16/32/64,*"),
+ DEFINE_TYPE(FTRACE_README_KRETPROBE_OFFSET, "*place (kretprobe): *"),
};
-bool probe_type_is_available(enum probe_type type)
+static bool scan_ftrace_readme(enum ftrace_readme type)
{
+ int fd;
FILE *fp;
char *buf = NULL;
size_t len = 0;
- bool target_line = false;
- bool ret = probe_type_table[type].avail;
+ bool ret = false;
+ static bool scanned = false;
- if (type >= PROBE_TYPE_END)
- return false;
- /* We don't have to check the type which supported by default */
- if (ret || probe_type_table[type].checked)
- return ret;
+ if (scanned)
+ goto result;
- if (asprintf(&buf, "%s/README", tracing_path) < 0)
+ fd = open_trace_file("README", false);
+ if (fd < 0)
return ret;
- fp = fopen(buf, "r");
- if (!fp)
- goto end;
-
- zfree(&buf);
- while (getline(&buf, &len, fp) > 0 && !ret) {
- if (!target_line) {
- target_line = !!strstr(buf, " type: ");
- if (!target_line)
- continue;
- } else if (strstr(buf, "\t ") != buf)
- break;
- ret = strglobmatch(buf, probe_type_table[type].pattern);
+ fp = fdopen(fd, "r");
+ if (!fp) {
+ close(fd);
+ return ret;
}
- /* Cache the result */
- probe_type_table[type].checked = true;
- probe_type_table[type].avail = ret;
+
+ while (getline(&buf, &len, fp) > 0)
+ for (enum ftrace_readme i = 0; i < FTRACE_README_END; i++)
+ if (!ftrace_readme_table[i].avail)
+ ftrace_readme_table[i].avail =
+ strglobmatch(buf, ftrace_readme_table[i].pattern);
+ scanned = true;
fclose(fp);
-end:
free(buf);
- return ret;
+result:
+ if (type >= FTRACE_README_END)
+ return false;
+
+ return ftrace_readme_table[type].avail;
+}
+
+bool probe_type_is_available(enum probe_type type)
+{
+ if (type >= PROBE_TYPE_END)
+ return false;
+ else if (type == PROBE_TYPE_X)
+ return scan_ftrace_readme(FTRACE_README_PROBE_TYPE_X);
+
+ return true;
+}
+
+bool kretprobe_offset_is_supported(void)
+{
+ return scan_ftrace_readme(FTRACE_README_KRETPROBE_OFFSET);
}
diff --git a/tools/perf/util/probe-file.h b/tools/perf/util/probe-file.h
index eba44c3e9dca..dbf95a00864a 100644
--- a/tools/perf/util/probe-file.h
+++ b/tools/perf/util/probe-file.h
@@ -35,6 +35,7 @@ enum probe_type {
/* probe-file.c depends on libelf */
#ifdef HAVE_LIBELF_SUPPORT
+int open_trace_file(const char *trace_file, bool readwrite);
int probe_file__open(int flag);
int probe_file__open_both(int *kfd, int *ufd, int flag);
struct strlist *probe_file__get_namelist(int fd);
@@ -64,6 +65,7 @@ struct probe_cache_entry *probe_cache__find_by_name(struct probe_cache *pcache,
const char *group, const char *event);
int probe_cache__show_all_caches(struct strfilter *filter);
bool probe_type_is_available(enum probe_type type);
+bool kretprobe_offset_is_supported(void);
#else /* ! HAVE_LIBELF_SUPPORT */
static inline struct probe_cache *probe_cache__new(const char *tgt __maybe_unused)
{
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 1dd617d116b5..24259bc2c598 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1,5 +1,6 @@
#include <linux/kernel.h>
#include <traceevent/event-parse.h>
+#include <api/fs/fs.h>
#include <byteswap.h>
#include <unistd.h>
@@ -1239,6 +1240,8 @@ static int machines__deliver_event(struct machines *machines,
return tool->mmap2(tool, event, sample, machine);
case PERF_RECORD_COMM:
return tool->comm(tool, event, sample, machine);
+ case PERF_RECORD_NAMESPACES:
+ return tool->namespaces(tool, event, sample, machine);
case PERF_RECORD_FORK:
return tool->fork(tool, event, sample, machine);
case PERF_RECORD_EXIT:
@@ -1258,9 +1261,12 @@ static int machines__deliver_event(struct machines *machines,
case PERF_RECORD_UNTHROTTLE:
return tool->unthrottle(tool, event, sample, machine);
case PERF_RECORD_AUX:
- if (tool->aux == perf_event__process_aux &&
- (event->aux.flags & PERF_AUX_FLAG_TRUNCATED))
- evlist->stats.total_aux_lost += 1;
+ if (tool->aux == perf_event__process_aux) {
+ if (event->aux.flags & PERF_AUX_FLAG_TRUNCATED)
+ evlist->stats.total_aux_lost += 1;
+ if (event->aux.flags & PERF_AUX_FLAG_PARTIAL)
+ evlist->stats.total_aux_partial += 1;
+ }
return tool->aux(tool, event, sample, machine);
case PERF_RECORD_ITRACE_START:
return tool->itrace_start(tool, event, sample, machine);
@@ -1494,6 +1500,11 @@ int perf_session__register_idle_thread(struct perf_session *session)
err = -1;
}
+ if (thread == NULL || thread__set_namespaces(thread, 0, NULL)) {
+ pr_err("problem inserting idle task.\n");
+ err = -1;
+ }
+
/* machine__findnew_thread() got the thread, so put it */
thread__put(thread);
return err;
@@ -1548,6 +1559,23 @@ static void perf_session__warn_about_errors(const struct perf_session *session)
stats->nr_events[PERF_RECORD_AUX]);
}
+ if (session->tool->aux == perf_event__process_aux &&
+ stats->total_aux_partial != 0) {
+ bool vmm_exclusive = false;
+
+ (void)sysfs__read_bool("module/kvm_intel/parameters/vmm_exclusive",
+ &vmm_exclusive);
+
+ ui__warning("AUX data had gaps in it %" PRIu64 " times out of %u!\n\n"
+ "Are you running a KVM guest in the background?%s\n\n",
+ stats->total_aux_partial,
+ stats->nr_events[PERF_RECORD_AUX],
+ vmm_exclusive ?
+ "\nReloading kvm_intel module with vmm_exclusive=0\n"
+ "will reduce the gaps to only guest's timeslices." :
+ "");
+ }
+
if (stats->nr_unknown_events != 0) {
ui__warning("Found %u unknown events!\n\n"
"Is this an older tool processing a perf.data "
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 0ff622288d24..73f3ec1cf2a0 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -323,7 +323,7 @@ char *hist_entry__get_srcline(struct hist_entry *he)
return SRCLINE_UNKNOWN;
return get_srcline(map->dso, map__rip_2objdump(map, he->ip),
- he->ms.sym, true);
+ he->ms.sym, true, true);
}
static int64_t
@@ -366,7 +366,8 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
left->branch_info->srcline_from = get_srcline(map->dso,
map__rip_2objdump(map,
left->branch_info->from.al_addr),
- left->branch_info->from.sym, true);
+ left->branch_info->from.sym,
+ true, true);
}
if (!right->branch_info->srcline_from) {
struct map *map = right->branch_info->from.map;
@@ -376,7 +377,8 @@ sort__srcline_from_cmp(struct hist_entry *left, struct hist_entry *right)
right->branch_info->srcline_from = get_srcline(map->dso,
map__rip_2objdump(map,
right->branch_info->from.al_addr),
- right->branch_info->from.sym, true);
+ right->branch_info->from.sym,
+ true, true);
}
return strcmp(right->branch_info->srcline_from, left->branch_info->srcline_from);
}
@@ -407,7 +409,8 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
left->branch_info->srcline_to = get_srcline(map->dso,
map__rip_2objdump(map,
left->branch_info->to.al_addr),
- left->branch_info->from.sym, true);
+ left->branch_info->from.sym,
+ true, true);
}
if (!right->branch_info->srcline_to) {
struct map *map = right->branch_info->to.map;
@@ -417,7 +420,8 @@ sort__srcline_to_cmp(struct hist_entry *left, struct hist_entry *right)
right->branch_info->srcline_to = get_srcline(map->dso,
map__rip_2objdump(map,
right->branch_info->to.al_addr),
- right->branch_info->to.sym, true);
+ right->branch_info->to.sym,
+ true, true);
}
return strcmp(right->branch_info->srcline_to, left->branch_info->srcline_to);
}
@@ -448,7 +452,7 @@ static char *hist_entry__get_srcfile(struct hist_entry *e)
return no_srcfile;
sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip),
- e->ms.sym, false, true);
+ e->ms.sym, false, true, true);
if (!strcmp(sf, SRCLINE_UNKNOWN))
return no_srcfile;
p = strchr(sf, ':');
@@ -536,6 +540,46 @@ struct sort_entry sort_cpu = {
.se_width_idx = HISTC_CPU,
};
+/* --sort cgroup_id */
+
+static int64_t _sort__cgroup_dev_cmp(u64 left_dev, u64 right_dev)
+{
+ return (int64_t)(right_dev - left_dev);
+}
+
+static int64_t _sort__cgroup_inode_cmp(u64 left_ino, u64 right_ino)
+{
+ return (int64_t)(right_ino - left_ino);
+}
+
+static int64_t
+sort__cgroup_id_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ int64_t ret;
+
+ ret = _sort__cgroup_dev_cmp(right->cgroup_id.dev, left->cgroup_id.dev);
+ if (ret != 0)
+ return ret;
+
+ return _sort__cgroup_inode_cmp(right->cgroup_id.ino,
+ left->cgroup_id.ino);
+}
+
+static int hist_entry__cgroup_id_snprintf(struct hist_entry *he,
+ char *bf, size_t size,
+ unsigned int width __maybe_unused)
+{
+ return repsep_snprintf(bf, size, "%lu/0x%lx", he->cgroup_id.dev,
+ he->cgroup_id.ino);
+}
+
+struct sort_entry sort_cgroup_id = {
+ .se_header = "cgroup id (dev/inode)",
+ .se_cmp = sort__cgroup_id_cmp,
+ .se_snprintf = hist_entry__cgroup_id_snprintf,
+ .se_width_idx = HISTC_CGROUP_ID,
+};
+
/* --sort socket */
static int64_t
@@ -846,6 +890,9 @@ static int hist_entry__mispredict_snprintf(struct hist_entry *he, char *bf,
static int64_t
sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
{
+ if (!left->branch_info || !right->branch_info)
+ return cmp_null(left->branch_info, right->branch_info);
+
return left->branch_info->flags.cycles -
right->branch_info->flags.cycles;
}
@@ -853,6 +900,8 @@ sort__cycles_cmp(struct hist_entry *left, struct hist_entry *right)
static int hist_entry__cycles_snprintf(struct hist_entry *he, char *bf,
size_t size, unsigned int width)
{
+ if (!he->branch_info)
+ return scnprintf(bf, size, "%-.*s", width, "N/A");
if (he->branch_info->flags.cycles == 0)
return repsep_snprintf(bf, size, "%-*s", width, "-");
return repsep_snprintf(bf, size, "%-*hd", width,
@@ -1396,6 +1445,46 @@ struct sort_entry sort_transaction = {
.se_width_idx = HISTC_TRANSACTION,
};
+/* --sort symbol_size */
+
+static int64_t _sort__sym_size_cmp(struct symbol *sym_l, struct symbol *sym_r)
+{
+ int64_t size_l = sym_l != NULL ? symbol__size(sym_l) : 0;
+ int64_t size_r = sym_r != NULL ? symbol__size(sym_r) : 0;
+
+ return size_l < size_r ? -1 :
+ size_l == size_r ? 0 : 1;
+}
+
+static int64_t
+sort__sym_size_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ return _sort__sym_size_cmp(right->ms.sym, left->ms.sym);
+}
+
+static int _hist_entry__sym_size_snprintf(struct symbol *sym, char *bf,
+ size_t bf_size, unsigned int width)
+{
+ if (sym)
+ return repsep_snprintf(bf, bf_size, "%*d", width, symbol__size(sym));
+
+ return repsep_snprintf(bf, bf_size, "%*s", width, "unknown");
+}
+
+static int hist_entry__sym_size_snprintf(struct hist_entry *he, char *bf,
+ size_t size, unsigned int width)
+{
+ return _hist_entry__sym_size_snprintf(he->ms.sym, bf, size, width);
+}
+
+struct sort_entry sort_sym_size = {
+ .se_header = "Symbol size",
+ .se_cmp = sort__sym_size_cmp,
+ .se_snprintf = hist_entry__sym_size_snprintf,
+ .se_width_idx = HISTC_SYM_SIZE,
+};
+
+
struct sort_dimension {
const char *name;
struct sort_entry *entry;
@@ -1418,6 +1507,8 @@ static struct sort_dimension common_sort_dimensions[] = {
DIM(SORT_GLOBAL_WEIGHT, "weight", sort_global_weight),
DIM(SORT_TRANSACTION, "transaction", sort_transaction),
DIM(SORT_TRACE, "trace", sort_trace),
+ DIM(SORT_SYM_SIZE, "symbol_size", sort_sym_size),
+ DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id),
};
#undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index 796c847e2f00..e35fb186d048 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -54,6 +54,11 @@ struct he_stat {
u32 nr_events;
};
+struct namespace_id {
+ u64 dev;
+ u64 ino;
+};
+
struct hist_entry_diff {
bool computed;
union {
@@ -91,6 +96,7 @@ struct hist_entry {
struct map_symbol ms;
struct thread *thread;
struct comm *comm;
+ struct namespace_id cgroup_id;
u64 ip;
u64 transaction;
s32 socket;
@@ -122,6 +128,7 @@ struct hist_entry {
};
char *srcline;
char *srcfile;
+ struct inline_node *inline_node;
struct symbol *parent;
struct branch_info *branch_info;
struct hists *hists;
@@ -211,6 +218,8 @@ enum sort_type {
SORT_GLOBAL_WEIGHT,
SORT_TRANSACTION,
SORT_TRACE,
+ SORT_SYM_SIZE,
+ SORT_CGROUP_ID,
/* branch stack specific sort keys */
__SORT_BRANCH_STACK,
diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c
index b4db3f48e3b0..778ccb5d99d1 100644
--- a/tools/perf/util/srcline.c
+++ b/tools/perf/util/srcline.c
@@ -7,11 +7,58 @@
#include "util/dso.h"
#include "util/util.h"
#include "util/debug.h"
+#include "util/callchain.h"
#include "symbol.h"
bool srcline_full_filename;
+static const char *dso__name(struct dso *dso)
+{
+ const char *dso_name;
+
+ if (dso->symsrc_filename)
+ dso_name = dso->symsrc_filename;
+ else
+ dso_name = dso->long_name;
+
+ if (dso_name[0] == '[')
+ return NULL;
+
+ if (!strncmp(dso_name, "/tmp/perf-", 10))
+ return NULL;
+
+ return dso_name;
+}
+
+static int inline_list__append(char *filename, char *funcname, int line_nr,
+ struct inline_node *node, struct dso *dso)
+{
+ struct inline_list *ilist;
+ char *demangled;
+
+ ilist = zalloc(sizeof(*ilist));
+ if (ilist == NULL)
+ return -1;
+
+ ilist->filename = filename;
+ ilist->line_nr = line_nr;
+
+ if (dso != NULL) {
+ demangled = dso__demangle_sym(dso, 0, funcname);
+ if (demangled == NULL) {
+ ilist->funcname = funcname;
+ } else {
+ ilist->funcname = demangled;
+ free(funcname);
+ }
+ }
+
+ list_add_tail(&ilist->list, &node->val);
+
+ return 0;
+}
+
#ifdef HAVE_LIBBFD_SUPPORT
/*
@@ -151,9 +198,17 @@ static void addr2line_cleanup(struct a2l_data *a2l)
#define MAX_INLINE_NEST 1024
+static void inline_list__reverse(struct inline_node *node)
+{
+ struct inline_list *ilist, *n;
+
+ list_for_each_entry_safe_reverse(ilist, n, &node->val, list)
+ list_move_tail(&ilist->list, &node->val);
+}
+
static int addr2line(const char *dso_name, u64 addr,
char **file, unsigned int *line, struct dso *dso,
- bool unwind_inlines)
+ bool unwind_inlines, struct inline_node *node)
{
int ret = 0;
struct a2l_data *a2l = dso->a2l;
@@ -178,8 +233,21 @@ static int addr2line(const char *dso_name, u64 addr,
while (bfd_find_inliner_info(a2l->abfd, &a2l->filename,
&a2l->funcname, &a2l->line) &&
- cnt++ < MAX_INLINE_NEST)
- ;
+ cnt++ < MAX_INLINE_NEST) {
+
+ if (node != NULL) {
+ if (inline_list__append(strdup(a2l->filename),
+ strdup(a2l->funcname),
+ a2l->line, node,
+ dso) != 0)
+ return 0;
+ }
+ }
+
+ if ((node != NULL) &&
+ (callchain_param.order != ORDER_CALLEE)) {
+ inline_list__reverse(node);
+ }
}
if (a2l->found && a2l->filename) {
@@ -205,18 +273,68 @@ void dso__free_a2l(struct dso *dso)
dso->a2l = NULL;
}
+static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
+ struct dso *dso)
+{
+ char *file = NULL;
+ unsigned int line = 0;
+ struct inline_node *node;
+
+ node = zalloc(sizeof(*node));
+ if (node == NULL) {
+ perror("not enough memory for the inline node");
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&node->val);
+ node->addr = addr;
+
+ if (!addr2line(dso_name, addr, &file, &line, dso, TRUE, node))
+ goto out_free_inline_node;
+
+ if (list_empty(&node->val))
+ goto out_free_inline_node;
+
+ return node;
+
+out_free_inline_node:
+ inline_node__delete(node);
+ return NULL;
+}
+
#else /* HAVE_LIBBFD_SUPPORT */
+static int filename_split(char *filename, unsigned int *line_nr)
+{
+ char *sep;
+
+ sep = strchr(filename, '\n');
+ if (sep)
+ *sep = '\0';
+
+ if (!strcmp(filename, "??:0"))
+ return 0;
+
+ sep = strchr(filename, ':');
+ if (sep) {
+ *sep++ = '\0';
+ *line_nr = strtoul(sep, NULL, 0);
+ return 1;
+ }
+
+ return 0;
+}
+
static int addr2line(const char *dso_name, u64 addr,
char **file, unsigned int *line_nr,
struct dso *dso __maybe_unused,
- bool unwind_inlines __maybe_unused)
+ bool unwind_inlines __maybe_unused,
+ struct inline_node *node __maybe_unused)
{
FILE *fp;
char cmd[PATH_MAX];
char *filename = NULL;
size_t len;
- char *sep;
int ret = 0;
scnprintf(cmd, sizeof(cmd), "addr2line -e %s %016"PRIx64,
@@ -233,23 +351,14 @@ static int addr2line(const char *dso_name, u64 addr,
goto out;
}
- sep = strchr(filename, '\n');
- if (sep)
- *sep = '\0';
-
- if (!strcmp(filename, "??:0")) {
- pr_debug("no debugging info in %s\n", dso_name);
+ ret = filename_split(filename, line_nr);
+ if (ret != 1) {
free(filename);
goto out;
}
- sep = strchr(filename, ':');
- if (sep) {
- *sep++ = '\0';
- *file = filename;
- *line_nr = strtoul(sep, NULL, 0);
- ret = 1;
- }
+ *file = filename;
+
out:
pclose(fp);
return ret;
@@ -259,6 +368,58 @@ void dso__free_a2l(struct dso *dso __maybe_unused)
{
}
+static struct inline_node *addr2inlines(const char *dso_name, u64 addr,
+ struct dso *dso __maybe_unused)
+{
+ FILE *fp;
+ char cmd[PATH_MAX];
+ struct inline_node *node;
+ char *filename = NULL;
+ size_t len;
+ unsigned int line_nr = 0;
+
+ scnprintf(cmd, sizeof(cmd), "addr2line -e %s -i %016"PRIx64,
+ dso_name, addr);
+
+ fp = popen(cmd, "r");
+ if (fp == NULL) {
+ pr_err("popen failed for %s\n", dso_name);
+ return NULL;
+ }
+
+ node = zalloc(sizeof(*node));
+ if (node == NULL) {
+ perror("not enough memory for the inline node");
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&node->val);
+ node->addr = addr;
+
+ while (getline(&filename, &len, fp) != -1) {
+ if (filename_split(filename, &line_nr) != 1) {
+ free(filename);
+ goto out;
+ }
+
+ if (inline_list__append(filename, NULL, line_nr, node,
+ NULL) != 0)
+ goto out;
+
+ filename = NULL;
+ }
+
+out:
+ pclose(fp);
+
+ if (list_empty(&node->val)) {
+ inline_node__delete(node);
+ return NULL;
+ }
+
+ return node;
+}
+
#endif /* HAVE_LIBBFD_SUPPORT */
/*
@@ -268,7 +429,7 @@ void dso__free_a2l(struct dso *dso __maybe_unused)
#define A2L_FAIL_LIMIT 123
char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
- bool show_sym, bool unwind_inlines)
+ bool show_sym, bool show_addr, bool unwind_inlines)
{
char *file = NULL;
unsigned line = 0;
@@ -278,18 +439,11 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
if (!dso->has_srcline)
goto out;
- if (dso->symsrc_filename)
- dso_name = dso->symsrc_filename;
- else
- dso_name = dso->long_name;
-
- if (dso_name[0] == '[')
- goto out;
-
- if (!strncmp(dso_name, "/tmp/perf-", 10))
+ dso_name = dso__name(dso);
+ if (dso_name == NULL)
goto out;
- if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines))
+ if (!addr2line(dso_name, addr, &file, &line, dso, unwind_inlines, NULL))
goto out;
if (asprintf(&srcline, "%s:%u",
@@ -309,6 +463,11 @@ out:
dso->has_srcline = 0;
dso__free_a2l(dso);
}
+
+ if (!show_addr)
+ return (show_sym && sym) ?
+ strndup(sym->name, sym->namelen) : NULL;
+
if (sym) {
if (asprintf(&srcline, "%s+%" PRIu64, show_sym ? sym->name : "",
addr - sym->start) < 0)
@@ -325,7 +484,32 @@ void free_srcline(char *srcline)
}
char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
- bool show_sym)
+ bool show_sym, bool show_addr)
+{
+ return __get_srcline(dso, addr, sym, show_sym, show_addr, false);
+}
+
+struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr)
{
- return __get_srcline(dso, addr, sym, show_sym, false);
+ const char *dso_name;
+
+ dso_name = dso__name(dso);
+ if (dso_name == NULL)
+ return NULL;
+
+ return addr2inlines(dso_name, addr, dso);
+}
+
+void inline_node__delete(struct inline_node *node)
+{
+ struct inline_list *ilist, *tmp;
+
+ list_for_each_entry_safe(ilist, tmp, &node->val, list) {
+ list_del_init(&ilist->list);
+ zfree(&ilist->filename);
+ zfree(&ilist->funcname);
+ free(ilist);
+ }
+
+ free(node);
}
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 8a2bbd2a4d82..ac10cc675d39 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -3,6 +3,9 @@
#include "stat.h"
#include "color.h"
#include "pmu.h"
+#include "rblist.h"
+#include "evlist.h"
+#include "expr.h"
enum {
CTX_BIT_USER = 1 << 0,
@@ -41,13 +44,73 @@ static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct rblist runtime_saved_values;
static bool have_frontend_stalled;
struct stats walltime_nsecs_stats;
+struct saved_value {
+ struct rb_node rb_node;
+ struct perf_evsel *evsel;
+ int cpu;
+ int ctx;
+ struct stats stats;
+};
+
+static int saved_value_cmp(struct rb_node *rb_node, const void *entry)
+{
+ struct saved_value *a = container_of(rb_node,
+ struct saved_value,
+ rb_node);
+ const struct saved_value *b = entry;
+
+ if (a->ctx != b->ctx)
+ return a->ctx - b->ctx;
+ if (a->cpu != b->cpu)
+ return a->cpu - b->cpu;
+ return a->evsel - b->evsel;
+}
+
+static struct rb_node *saved_value_new(struct rblist *rblist __maybe_unused,
+ const void *entry)
+{
+ struct saved_value *nd = malloc(sizeof(struct saved_value));
+
+ if (!nd)
+ return NULL;
+ memcpy(nd, entry, sizeof(struct saved_value));
+ return &nd->rb_node;
+}
+
+static struct saved_value *saved_value_lookup(struct perf_evsel *evsel,
+ int cpu, int ctx,
+ bool create)
+{
+ struct rb_node *nd;
+ struct saved_value dm = {
+ .cpu = cpu,
+ .ctx = ctx,
+ .evsel = evsel,
+ };
+ nd = rblist__find(&runtime_saved_values, &dm);
+ if (nd)
+ return container_of(nd, struct saved_value, rb_node);
+ if (create) {
+ rblist__add_node(&runtime_saved_values, &dm);
+ nd = rblist__find(&runtime_saved_values, &dm);
+ if (nd)
+ return container_of(nd, struct saved_value, rb_node);
+ }
+ return NULL;
+}
+
void perf_stat__init_shadow_stats(void)
{
have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend");
+ rblist__init(&runtime_saved_values);
+ runtime_saved_values.node_cmp = saved_value_cmp;
+ runtime_saved_values.node_new = saved_value_new;
+ /* No delete for now */
}
static int evsel_context(struct perf_evsel *evsel)
@@ -70,6 +133,8 @@ static int evsel_context(struct perf_evsel *evsel)
void perf_stat__reset_shadow_stats(void)
{
+ struct rb_node *pos, *next;
+
memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
@@ -92,6 +157,15 @@ void perf_stat__reset_shadow_stats(void)
memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
+
+ next = rb_first(&runtime_saved_values.entries);
+ while (next) {
+ pos = next;
+ next = rb_next(pos);
+ memset(&container_of(pos, struct saved_value, rb_node)->stats,
+ 0,
+ sizeof(struct stats));
+ }
}
/*
@@ -143,6 +217,12 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
+
+ if (counter->collect_stat) {
+ struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
+ true);
+ update_stats(&v->stats, count[0]);
+ }
}
/* used for get_ratio_color() */
@@ -172,6 +252,95 @@ static const char *get_ratio_color(enum grc_type type, double ratio)
return color;
}
+static struct perf_evsel *perf_stat__find_event(struct perf_evlist *evsel_list,
+ const char *name)
+{
+ struct perf_evsel *c2;
+
+ evlist__for_each_entry (evsel_list, c2) {
+ if (!strcasecmp(c2->name, name))
+ return c2;
+ }
+ return NULL;
+}
+
+/* Mark MetricExpr target events and link events using them to them. */
+void perf_stat__collect_metric_expr(struct perf_evlist *evsel_list)
+{
+ struct perf_evsel *counter, *leader, **metric_events, *oc;
+ bool found;
+ const char **metric_names;
+ int i;
+ int num_metric_names;
+
+ evlist__for_each_entry(evsel_list, counter) {
+ bool invalid = false;
+
+ leader = counter->leader;
+ if (!counter->metric_expr)
+ continue;
+ metric_events = counter->metric_events;
+ if (!metric_events) {
+ if (expr__find_other(counter->metric_expr, counter->name,
+ &metric_names, &num_metric_names) < 0)
+ continue;
+
+ metric_events = calloc(sizeof(struct perf_evsel *),
+ num_metric_names + 1);
+ if (!metric_events)
+ return;
+ counter->metric_events = metric_events;
+ }
+
+ for (i = 0; i < num_metric_names; i++) {
+ found = false;
+ if (leader) {
+ /* Search in group */
+ for_each_group_member (oc, leader) {
+ if (!strcasecmp(oc->name, metric_names[i])) {
+ found = true;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ /* Search ignoring groups */
+ oc = perf_stat__find_event(evsel_list, metric_names[i]);
+ }
+ if (!oc) {
+ /* Deduping one is good enough to handle duplicated PMUs. */
+ static char *printed;
+
+ /*
+ * Adding events automatically would be difficult, because
+ * it would risk creating groups that are not schedulable.
+ * perf stat doesn't understand all the scheduling constraints
+ * of events. So we ask the user instead to add the missing
+ * events.
+ */
+ if (!printed || strcasecmp(printed, metric_names[i])) {
+ fprintf(stderr,
+ "Add %s event to groups to get metric expression for %s\n",
+ metric_names[i],
+ counter->name);
+ printed = strdup(metric_names[i]);
+ }
+ invalid = true;
+ continue;
+ }
+ metric_events[i] = oc;
+ oc->collect_stat = true;
+ }
+ metric_events[i] = NULL;
+ free(metric_names);
+ if (invalid) {
+ free(metric_events);
+ counter->metric_events = NULL;
+ counter->metric_expr = NULL;
+ }
+ }
+}
+
static void print_stalled_cycles_frontend(int cpu,
struct perf_evsel *evsel, double avg,
struct perf_stat_output_ctx *out)
@@ -614,6 +783,34 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
be_bound * 100.);
else
print_metric(ctxp, NULL, NULL, name, 0);
+ } else if (evsel->metric_expr) {
+ struct parse_ctx pctx;
+ int i;
+
+ expr__ctx_init(&pctx);
+ expr__add_id(&pctx, evsel->name, avg);
+ for (i = 0; evsel->metric_events[i]; i++) {
+ struct saved_value *v;
+
+ v = saved_value_lookup(evsel->metric_events[i], cpu, ctx, false);
+ if (!v)
+ break;
+ expr__add_id(&pctx, evsel->metric_events[i]->name,
+ avg_stats(&v->stats));
+ }
+ if (!evsel->metric_events[i]) {
+ const char *p = evsel->metric_expr;
+
+ if (expr__parse(&ratio, &pctx, &p) == 0)
+ print_metric(ctxp, NULL, "%8.1f",
+ evsel->metric_name ?
+ evsel->metric_name :
+ out->force_header ? evsel->name : "",
+ ratio);
+ else
+ print_metric(ctxp, NULL, NULL, "", 0);
+ } else
+ print_metric(ctxp, NULL, NULL, "", 0);
} else if (runtime_nsecs_stats[cpu].n != 0) {
char unit = 'M';
char unit_buf[10];
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index c29bb94c48a4..0a65ae23f495 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -85,11 +85,13 @@ struct perf_stat_output_ctx {
void *ctx;
print_metric_t print_metric;
new_line_t new_line;
+ bool force_header;
};
void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
double avg, int cpu,
struct perf_stat_output_ctx *out);
+void perf_stat__collect_metric_expr(struct perf_evlist *);
int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw);
void perf_evlist__free_stats(struct perf_evlist *evlist);
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index 4e59ddeb4eda..d1a40bb642ff 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -390,6 +390,11 @@ out_elf_end:
return 0;
}
+char *dso__demangle_sym(struct dso *dso, int kmodule, char *elf_name)
+{
+ return demangle_sym(dso, kmodule, elf_name);
+}
+
/*
* Align offset to 4 bytes as needed for note name and descriptor data.
*/
@@ -1828,7 +1833,7 @@ void kcore_extract__delete(struct kcore_extract *kce)
static int populate_sdt_note(Elf **elf, const char *data, size_t len,
struct list_head *sdt_notes)
{
- const char *provider, *name;
+ const char *provider, *name, *args;
struct sdt_note *tmp = NULL;
GElf_Ehdr ehdr;
GElf_Addr base_off = 0;
@@ -1887,6 +1892,25 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len,
goto out_free_prov;
}
+ args = memchr(name, '\0', data + len - name);
+
+ /*
+ * There is no argument if:
+ * - We reached the end of the note;
+ * - There is not enough room to hold a potential string;
+ * - The argument string is empty or just contains ':'.
+ */
+ if (args == NULL || data + len - args < 2 ||
+ args[1] == ':' || args[1] == '\0')
+ tmp->args = NULL;
+ else {
+ tmp->args = strdup(++args);
+ if (!tmp->args) {
+ ret = -ENOMEM;
+ goto out_free_name;
+ }
+ }
+
if (gelf_getclass(*elf) == ELFCLASS32) {
memcpy(&tmp->addr, &buf, 3 * sizeof(Elf32_Addr));
tmp->bit32 = true;
@@ -1898,7 +1922,7 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len,
if (!gelf_getehdr(*elf, &ehdr)) {
pr_debug("%s : cannot get elf header.\n", __func__);
ret = -EBADF;
- goto out_free_name;
+ goto out_free_args;
}
/* Adjust the prelink effect :
@@ -1923,6 +1947,8 @@ static int populate_sdt_note(Elf **elf, const char *data, size_t len,
list_add_tail(&tmp->note_list, sdt_notes);
return 0;
+out_free_args:
+ free(tmp->args);
out_free_name:
free(tmp->name);
out_free_prov:
diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c
index 11cdde980545..870ef0f0659c 100644
--- a/tools/perf/util/symbol-minimal.c
+++ b/tools/perf/util/symbol-minimal.c
@@ -373,3 +373,10 @@ int kcore_copy(const char *from_dir __maybe_unused,
void symbol__elf_init(void)
{
}
+
+char *dso__demangle_sym(struct dso *dso __maybe_unused,
+ int kmodule __maybe_unused,
+ char *elf_name __maybe_unused)
+{
+ return NULL;
+}
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 6c358b7ed336..5245d2fb1a0a 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -118,7 +118,8 @@ struct symbol_conf {
show_ref_callgraph,
hide_unresolved,
raw_trace,
- report_hierarchy;
+ report_hierarchy,
+ inline_name;
const char *vmlinux_name,
*kallsyms_name,
*source_prefix,
@@ -305,6 +306,8 @@ int dso__load_sym(struct dso *dso, struct map *map, struct symsrc *syms_ss,
int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss,
struct map *map);
+char *dso__demangle_sym(struct dso *dso, int kmodule, char *elf_name);
+
void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool kernel);
void symbols__insert(struct rb_root *symbols, struct symbol *sym);
void symbols__fixup_duplicate(struct rb_root *symbols);
@@ -351,6 +354,7 @@ int arch__choose_best_symbol(struct symbol *syma, struct symbol *symb);
struct sdt_note {
char *name; /* name of the note*/
char *provider; /* provider name */
+ char *args;
bool bit32; /* whether the location is 32 bits? */
union { /* location, base and semaphore addrs */
Elf64_Addr a64[3];
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index f5af87f66663..dcdb87a5d0a1 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -7,6 +7,7 @@
#include "thread-stack.h"
#include "util.h"
#include "debug.h"
+#include "namespaces.h"
#include "comm.h"
#include "unwind.h"
@@ -40,6 +41,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
thread->tid = tid;
thread->ppid = -1;
thread->cpu = -1;
+ INIT_LIST_HEAD(&thread->namespaces_list);
INIT_LIST_HEAD(&thread->comm_list);
comm_str = malloc(32);
@@ -53,7 +55,7 @@ struct thread *thread__new(pid_t pid, pid_t tid)
goto err_thread;
list_add(&comm->list, &thread->comm_list);
- atomic_set(&thread->refcnt, 1);
+ refcount_set(&thread->refcnt, 1);
RB_CLEAR_NODE(&thread->rb_node);
}
@@ -66,7 +68,8 @@ err_thread:
void thread__delete(struct thread *thread)
{
- struct comm *comm, *tmp;
+ struct namespaces *namespaces, *tmp_namespaces;
+ struct comm *comm, *tmp_comm;
BUG_ON(!RB_EMPTY_NODE(&thread->rb_node));
@@ -76,7 +79,12 @@ void thread__delete(struct thread *thread)
map_groups__put(thread->mg);
thread->mg = NULL;
}
- list_for_each_entry_safe(comm, tmp, &thread->comm_list, list) {
+ list_for_each_entry_safe(namespaces, tmp_namespaces,
+ &thread->namespaces_list, list) {
+ list_del(&namespaces->list);
+ namespaces__free(namespaces);
+ }
+ list_for_each_entry_safe(comm, tmp_comm, &thread->comm_list, list) {
list_del(&comm->list);
comm__free(comm);
}
@@ -88,13 +96,13 @@ void thread__delete(struct thread *thread)
struct thread *thread__get(struct thread *thread)
{
if (thread)
- atomic_inc(&thread->refcnt);
+ refcount_inc(&thread->refcnt);
return thread;
}
void thread__put(struct thread *thread)
{
- if (thread && atomic_dec_and_test(&thread->refcnt)) {
+ if (thread && refcount_dec_and_test(&thread->refcnt)) {
/*
* Remove it from the dead_threads list, as last reference
* is gone.
@@ -104,6 +112,38 @@ void thread__put(struct thread *thread)
}
}
+struct namespaces *thread__namespaces(const struct thread *thread)
+{
+ if (list_empty(&thread->namespaces_list))
+ return NULL;
+
+ return list_first_entry(&thread->namespaces_list, struct namespaces, list);
+}
+
+int thread__set_namespaces(struct thread *thread, u64 timestamp,
+ struct namespaces_event *event)
+{
+ struct namespaces *new, *curr = thread__namespaces(thread);
+
+ new = namespaces__new(event);
+ if (!new)
+ return -ENOMEM;
+
+ list_add(&new->list, &thread->namespaces_list);
+
+ if (timestamp && curr) {
+ /*
+ * setns syscall must have changed few or all the namespaces
+ * of this thread. Update end time for the namespaces
+ * previously used.
+ */
+ curr = list_next_entry(new, list);
+ curr->end_time = timestamp;
+ }
+
+ return 0;
+}
+
struct comm *thread__comm(const struct thread *thread)
{
if (list_empty(&thread->comm_list))
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 99263cb6e6b6..4eb849e9098f 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -1,7 +1,7 @@
#ifndef __PERF_THREAD_H
#define __PERF_THREAD_H
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/rbtree.h>
#include <linux/list.h>
#include <unistd.h>
@@ -23,11 +23,12 @@ struct thread {
pid_t tid;
pid_t ppid;
int cpu;
- atomic_t refcnt;
+ refcount_t refcnt;
char shortname[3];
bool comm_set;
int comm_len;
bool dead; /* if set thread has exited */
+ struct list_head namespaces_list;
struct list_head comm_list;
u64 db_id;
@@ -40,6 +41,7 @@ struct thread {
};
struct machine;
+struct namespaces;
struct comm;
struct thread *thread__new(pid_t pid, pid_t tid);
@@ -62,6 +64,10 @@ static inline void thread__exited(struct thread *thread)
thread->dead = true;
}
+struct namespaces *thread__namespaces(const struct thread *thread);
+int thread__set_namespaces(struct thread *thread, u64 timestamp,
+ struct namespaces_event *event);
+
int __thread__set_comm(struct thread *thread, const char *comm, u64 timestamp,
bool exec);
static inline int thread__set_comm(struct thread *thread, const char *comm,
diff --git a/tools/perf/util/thread_map.c b/tools/perf/util/thread_map.c
index 7c3fcc538a70..9026408ea55b 100644
--- a/tools/perf/util/thread_map.c
+++ b/tools/perf/util/thread_map.c
@@ -66,7 +66,7 @@ struct thread_map *thread_map__new_by_pid(pid_t pid)
for (i = 0; i < items; i++)
thread_map__set_pid(threads, i, atoi(namelist[i]->d_name));
threads->nr = items;
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
}
for (i=0; i<items; i++)
@@ -83,7 +83,7 @@ struct thread_map *thread_map__new_by_tid(pid_t tid)
if (threads != NULL) {
thread_map__set_pid(threads, 0, tid);
threads->nr = 1;
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
}
return threads;
@@ -105,7 +105,7 @@ struct thread_map *thread_map__new_by_uid(uid_t uid)
goto out_free_threads;
threads->nr = 0;
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
while ((dirent = readdir(proc)) != NULL) {
char *end;
@@ -235,7 +235,7 @@ static struct thread_map *thread_map__new_by_pid_str(const char *pid_str)
out:
strlist__delete(slist);
if (threads)
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
return threads;
out_free_namelist:
@@ -255,7 +255,7 @@ struct thread_map *thread_map__new_dummy(void)
if (threads != NULL) {
thread_map__set_pid(threads, 0, -1);
threads->nr = 1;
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
}
return threads;
}
@@ -300,7 +300,7 @@ struct thread_map *thread_map__new_by_tid_str(const char *tid_str)
}
out:
if (threads)
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
return threads;
out_free_threads:
@@ -326,7 +326,7 @@ static void thread_map__delete(struct thread_map *threads)
if (threads) {
int i;
- WARN_ONCE(atomic_read(&threads->refcnt) != 0,
+ WARN_ONCE(refcount_read(&threads->refcnt) != 0,
"thread map refcnt unbalanced\n");
for (i = 0; i < threads->nr; i++)
free(thread_map__comm(threads, i));
@@ -337,13 +337,13 @@ static void thread_map__delete(struct thread_map *threads)
struct thread_map *thread_map__get(struct thread_map *map)
{
if (map)
- atomic_inc(&map->refcnt);
+ refcount_inc(&map->refcnt);
return map;
}
void thread_map__put(struct thread_map *map)
{
- if (map && atomic_dec_and_test(&map->refcnt))
+ if (map && refcount_dec_and_test(&map->refcnt))
thread_map__delete(map);
}
@@ -423,7 +423,7 @@ static void thread_map__copy_event(struct thread_map *threads,
threads->map[i].comm = strndup(event->entries[i].comm, 16);
}
- atomic_set(&threads->refcnt, 1);
+ refcount_set(&threads->refcnt, 1);
}
struct thread_map *thread_map__new_event(struct thread_map_event *event)
diff --git a/tools/perf/util/thread_map.h b/tools/perf/util/thread_map.h
index ea0ef08c6303..bd34d7a0b9fa 100644
--- a/tools/perf/util/thread_map.h
+++ b/tools/perf/util/thread_map.h
@@ -3,7 +3,7 @@
#include <sys/types.h>
#include <stdio.h>
-#include <linux/atomic.h>
+#include <linux/refcount.h>
struct thread_map_data {
pid_t pid;
@@ -11,7 +11,7 @@ struct thread_map_data {
};
struct thread_map {
- atomic_t refcnt;
+ refcount_t refcnt;
int nr;
struct thread_map_data map[];
};
diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h
index ac2590a3de2d..829471a1c6d7 100644
--- a/tools/perf/util/tool.h
+++ b/tools/perf/util/tool.h
@@ -40,6 +40,7 @@ struct perf_tool {
event_op mmap,
mmap2,
comm,
+ namespaces,
fork,
exit,
lost,
@@ -66,6 +67,7 @@ struct perf_tool {
event_op3 auxtrace;
bool ordered_events;
bool ordering_requires_timestamps;
+ bool namespace_events;
};
#endif /* __PERF_TOOL_H */
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index 27420159bf69..8a9a677f7576 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -192,7 +192,7 @@ static int read_ftrace_printk(struct pevent *pevent)
if (!size)
return 0;
- buf = malloc(size);
+ buf = malloc(size + 1);
if (buf == NULL)
return -1;
@@ -201,6 +201,8 @@ static int read_ftrace_printk(struct pevent *pevent)
return -1;
}
+ buf[size] = '\0';
+
parse_ftrace_printk(pevent, buf, size);
free(buf);
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index c74708da8571..7cf5752b38fd 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -287,9 +287,9 @@ struct symbol;
extern bool srcline_full_filename;
char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
- bool show_sym);
+ bool show_sym, bool show_addr);
char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym,
- bool show_sym, bool unwind_inlines);
+ bool show_sym, bool show_addr, bool unwind_inlines);
void free_srcline(char *srcline);
int perf_event_paranoid(void);
@@ -355,8 +355,8 @@ void print_binary(unsigned char *data, size_t len,
size_t bytes_per_line, print_binary_t printer,
void *extra);
-#if !defined(__GLIBC__) && !defined(__ANDROID__)
-extern int sched_getcpu(void);
+#ifndef HAVE_SCHED_GETCPU_SUPPORT
+int sched_getcpu(void);
#endif
int is_printable_array(char *p, unsigned int len);
@@ -364,4 +364,20 @@ int is_printable_array(char *p, unsigned int len);
int timestamp__scnprintf_usec(u64 timestamp, char *buf, size_t sz);
int unit_number__scnprintf(char *buf, size_t size, u64 n);
+
+struct inline_list {
+ char *filename;
+ char *funcname;
+ unsigned int line_nr;
+ struct list_head list;
+};
+
+struct inline_node {
+ u64 addr;
+ struct list_head val;
+};
+
+struct inline_node *dso__parse_addr_inlines(struct dso *dso, u64 addr);
+void inline_node__delete(struct inline_node *node);
+
#endif /* GIT_COMPAT_UTIL_H */
diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c
index 5074be4ed467..5de2e15e2eda 100644
--- a/tools/perf/util/values.c
+++ b/tools/perf/util/values.c
@@ -1,4 +1,7 @@
+#include <inttypes.h>
+#include <stdio.h>
#include <stdlib.h>
+#include <errno.h>
#include "util.h"
#include "values.h"
@@ -108,24 +111,45 @@ static int perf_read_values__findnew_thread(struct perf_read_values *values,
return i;
}
-static void perf_read_values__enlarge_counters(struct perf_read_values *values)
+static int perf_read_values__enlarge_counters(struct perf_read_values *values)
{
- int i;
+ char **countername;
+ int i, counters_max = values->counters_max * 2;
+ u64 *counterrawid = realloc(values->counterrawid, counters_max * sizeof(*values->counterrawid));
+
+ if (!counterrawid) {
+ pr_debug("failed to enlarge read_values rawid array");
+ goto out_enomem;
+ }
- values->counters_max *= 2;
- values->counterrawid = realloc(values->counterrawid,
- values->counters_max * sizeof(*values->counterrawid));
- values->countername = realloc(values->countername,
- values->counters_max * sizeof(*values->countername));
- if (!values->counterrawid || !values->countername)
- die("failed to enlarge read_values counters arrays");
+ countername = realloc(values->countername, counters_max * sizeof(*values->countername));
+ if (!countername) {
+ pr_debug("failed to enlarge read_values rawid array");
+ goto out_free_rawid;
+ }
for (i = 0; i < values->threads; i++) {
- values->value[i] = realloc(values->value[i],
- values->counters_max * sizeof(**values->value));
- if (!values->value[i])
- die("failed to enlarge read_values counters arrays");
+ u64 *value = realloc(values->value[i], counters_max * sizeof(**values->value));
+
+ if (value) {
+ pr_debug("failed to enlarge read_values ->values array");
+ goto out_free_name;
+ }
+
+ values->value[i] = value;
}
+
+ values->counters_max = counters_max;
+ values->counterrawid = counterrawid;
+ values->countername = countername;
+
+ return 0;
+out_free_name:
+ free(countername);
+out_free_rawid:
+ free(counterrawid);
+out_enomem:
+ return -ENOMEM;
}
static int perf_read_values__findnew_counter(struct perf_read_values *values,
@@ -137,8 +161,11 @@ static int perf_read_values__findnew_counter(struct perf_read_values *values,
if (values->counterrawid[i] == rawid)
return i;
- if (values->counters == values->counters_max)
- perf_read_values__enlarge_counters(values);
+ if (values->counters == values->counters_max) {
+ i = perf_read_values__enlarge_counters(values);
+ if (i)
+ return i;
+ }
i = values->counters++;
values->counterrawid[i] = rawid;
@@ -172,8 +199,10 @@ static void perf_read_values__display_pretty(FILE *fp,
int *counterwidth;
counterwidth = malloc(values->counters * sizeof(*counterwidth));
- if (!counterwidth)
- die("failed to allocate counterwidth array");
+ if (!counterwidth) {
+ fprintf(fp, "INTERNAL ERROR: Failed to allocate counterwidth array\n");
+ return;
+ }
tidwidth = 3;
pidwidth = 3;
for (j = 0; j < values->counters; j++)
diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include
index 621578aa12d6..fc74db62fef4 100644
--- a/tools/scripts/Makefile.include
+++ b/tools/scripts/Makefile.include
@@ -43,6 +43,15 @@ ifneq ($(CC), clang)
EXTRA_WARNINGS += -Wstrict-aliasing=3
endif
+# Hack to avoid type-punned warnings on old systems such as RHEL5:
+# We should be changing CFLAGS and checking gcc version, but this
+# will do for now and keep the above -Wstrict-aliasing=3 in place
+# in newer systems.
+# Needed for the __raw_cmpxchg in tools/arch/x86/include/asm/cmpxchg.h
+ifneq ($(filter 3.%,$(MAKE_VERSION)),) # make-3
+EXTRA_WARNINGS += -fno-strict-aliasing
+endif
+
ifneq ($(findstring $(MAKEFLAGS), w),w)
PRINT_DIR = --no-print-directory
else
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
index f6121612e769..b9a22f18566a 100644
--- a/tools/testing/selftests/x86/ldt_gdt.c
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -409,6 +409,51 @@ static void *threadproc(void *ctx)
}
}
+#ifdef __i386__
+
+#ifndef SA_RESTORE
+#define SA_RESTORER 0x04000000
+#endif
+
+/*
+ * The UAPI header calls this 'struct sigaction', which conflicts with
+ * glibc. Sigh.
+ */
+struct fake_ksigaction {
+ void *handler; /* the real type is nasty */
+ unsigned long sa_flags;
+ void (*sa_restorer)(void);
+ unsigned char sigset[8];
+};
+
+static void fix_sa_restorer(int sig)
+{
+ struct fake_ksigaction ksa;
+
+ if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) {
+ /*
+ * glibc has a nasty bug: it sometimes writes garbage to
+ * sa_restorer. This interacts quite badly with anything
+ * that fiddles with SS because it can trigger legacy
+ * stack switching. Patch it up. See:
+ *
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=21269
+ */
+ if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) {
+ ksa.sa_restorer = NULL;
+ if (syscall(SYS_rt_sigaction, sig, &ksa, NULL,
+ sizeof(ksa.sigset)) != 0)
+ err(1, "rt_sigaction");
+ }
+ }
+}
+#else
+static void fix_sa_restorer(int sig)
+{
+ /* 64-bit glibc works fine. */
+}
+#endif
+
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
@@ -420,6 +465,7 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
+ fix_sa_restorer(sig);
}
static jmp_buf jmpbuf;