summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2023-04-11 12:36:47 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2023-04-11 12:36:49 +1000
commite92638cf57c5e99b0b641f305f5b6fe27212f086 (patch)
tree0e5aec6127ef2a0d0517c29084cc91165e9f41a3
parent29bb20ddcec3979037005819fded9b1604da3382 (diff)
parentb2e7ae549d84c654744bf70d30a881bfa140a6e3 (diff)
downloadlinux-next-e92638cf57c5e99b0b641f305f5b6fe27212f086.tar.gz
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt17
-rw-r--r--Documentation/arch/x86/index.rst1
-rw-r--r--Documentation/arch/x86/microcode.rst10
-rw-r--r--Documentation/arch/x86/shstk.rst179
-rw-r--r--Documentation/arch/x86/xstate.rst100
-rw-r--r--Documentation/filesystems/proc.rst1
-rw-r--r--Documentation/livepatch/reliable-stacktrace.rst2
-rw-r--r--Documentation/mm/arch_pgtable_helpers.rst9
-rw-r--r--Documentation/virt/coco/sev-guest.rst20
-rw-r--r--MAINTAINERS13
-rw-r--r--arch/alpha/include/asm/cmpxchg.h10
-rw-r--r--arch/alpha/include/asm/pgtable.h6
-rw-r--r--arch/alpha/kernel/process.c4
-rw-r--r--arch/alpha/kernel/smp.c2
-rw-r--r--arch/arc/include/asm/cmpxchg.h4
-rw-r--r--arch/arc/include/asm/hugepage.h2
-rw-r--r--arch/arc/include/asm/pgtable-bits-arcv2.h7
-rw-r--r--arch/arc/kernel/smp.c2
-rw-r--r--arch/arm/include/asm/cmpxchg.h7
-rw-r--r--arch/arm/include/asm/pgtable-3level.h7
-rw-r--r--arch/arm/include/asm/pgtable.h2
-rw-r--r--arch/arm/kernel/signal.c2
-rw-r--r--arch/arm/kernel/smp.c7
-rw-r--r--arch/arm/mach-actions/platsmp.c2
-rw-r--r--arch/arm/probes/uprobes/core.c8
-rw-r--r--arch/arm/vdso/Makefile4
-rw-r--r--arch/arm64/include/asm/cmpxchg.h7
-rw-r--r--arch/arm64/include/asm/mmu_context.h6
-rw-r--r--arch/arm64/include/asm/pgtable.h9
-rw-r--r--arch/arm64/include/asm/smp.h2
-rw-r--r--arch/arm64/kernel/probes/uprobes.c9
-rw-r--r--arch/arm64/kernel/process.c2
-rw-r--r--arch/arm64/kernel/signal.c2
-rw-r--r--arch/arm64/kernel/signal32.c2
-rw-r--r--arch/arm64/kernel/smp.c5
-rw-r--r--arch/arm64/kernel/vdso/Makefile4
-rw-r--r--arch/arm64/kernel/vdso32/Makefile3
-rw-r--r--arch/arm64/mm/trans_pgd.c4
-rw-r--r--arch/csky/include/asm/pgtable.h2
-rw-r--r--arch/csky/kernel/probes/uprobes.c9
-rw-r--r--arch/csky/kernel/smp.c6
-rw-r--r--arch/csky/kernel/vdso/Makefile4
-rw-r--r--arch/hexagon/include/asm/cmpxchg.h10
-rw-r--r--arch/hexagon/include/asm/pgtable.h2
-rw-r--r--arch/hexagon/kernel/smp.c2
-rw-r--r--arch/ia64/include/asm/cmpxchg.h2
-rw-r--r--arch/ia64/include/asm/pgtable.h2
-rw-r--r--arch/ia64/include/uapi/asm/cmpxchg.h4
-rw-r--r--arch/ia64/kernel/process.c6
-rw-r--r--arch/ia64/kernel/smp.c4
-rw-r--r--arch/loongarch/include/asm/cmpxchg.h4
-rw-r--r--arch/loongarch/include/asm/pgtable.h4
-rw-r--r--arch/loongarch/include/asm/smp.h2
-rw-r--r--arch/loongarch/kernel/process.c2
-rw-r--r--arch/loongarch/kernel/smp.c6
-rw-r--r--arch/loongarch/vdso/Makefile4
-rw-r--r--arch/m68k/include/asm/cmpxchg.h6
-rw-r--r--arch/m68k/include/asm/mcf_pgtable.h2
-rw-r--r--arch/m68k/include/asm/motorola_pgtable.h6
-rw-r--r--arch/m68k/include/asm/sun3_pgtable.h6
-rw-r--r--arch/microblaze/include/asm/pgtable.h2
-rw-r--r--arch/mips/cavium-octeon/smp.c1
-rw-r--r--arch/mips/include/asm/cmpxchg.h4
-rw-r--r--arch/mips/include/asm/pgtable.h6
-rw-r--r--arch/mips/include/asm/smp.h4
-rw-r--r--arch/mips/kernel/process.c2
-rw-r--r--arch/mips/kernel/rtlx-cmp.c2
-rw-r--r--arch/mips/kernel/smp-bmips.c4
-rw-r--r--arch/mips/kernel/smp-cps.c1
-rw-r--r--arch/mips/kernel/uprobes.c10
-rw-r--r--arch/mips/loongson64/smp.c2
-rw-r--r--arch/mips/vdso/Makefile4
-rw-r--r--arch/nios2/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/include/asm/cmpxchg.h10
-rw-r--r--arch/openrisc/include/asm/pgtable.h2
-rw-r--r--arch/openrisc/kernel/smp.c2
-rw-r--r--arch/parisc/include/asm/cmpxchg.h4
-rw-r--r--arch/parisc/include/asm/pgtable.h6
-rw-r--r--arch/parisc/kernel/process.c2
-rw-r--r--arch/parisc/kernel/smp.c4
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h4
-rw-r--r--arch/powerpc/include/asm/cmpxchg.h4
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/nohash/32/pte-8xx.h2
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/smp.h2
-rw-r--r--arch/powerpc/kernel/smp.c8
-rw-r--r--arch/powerpc/kernel/uprobes.c10
-rw-r--r--arch/powerpc/kernel/vdso/Makefile2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c3
-rw-r--r--arch/powerpc/platforms/powernv/subcore.c2
-rw-r--r--arch/riscv/include/asm/atomic.h2
-rw-r--r--arch/riscv/include/asm/cmpxchg.h4
-rw-r--r--arch/riscv/include/asm/pgtable.h6
-rw-r--r--arch/riscv/kernel/cpu-hotplug.c2
-rw-r--r--arch/riscv/kernel/probes/uprobes.c9
-rw-r--r--arch/riscv/kernel/smp.c4
-rw-r--r--arch/riscv/kernel/vdso/Makefile4
-rw-r--r--arch/s390/include/asm/cmpxchg.h8
-rw-r--r--arch/s390/include/asm/hugetlb.h4
-rw-r--r--arch/s390/include/asm/pgtable.h14
-rw-r--r--arch/s390/kernel/idle.c2
-rw-r--r--arch/s390/kernel/smp.c2
-rw-r--r--arch/s390/kernel/uprobes.c7
-rw-r--r--arch/s390/kernel/vdso32/Makefile3
-rw-r--r--arch/s390/kernel/vdso64/Makefile3
-rw-r--r--arch/s390/mm/pageattr.c4
-rw-r--r--arch/sh/include/asm/cmpxchg.h4
-rw-r--r--arch/sh/include/asm/pgtable_32.h10
-rw-r--r--arch/sh/include/asm/smp-ops.h5
-rw-r--r--arch/sh/kernel/idle.c3
-rw-r--r--arch/sh/kernel/smp.c2
-rw-r--r--arch/sparc/include/asm/cmpxchg_32.h4
-rw-r--r--arch/sparc/include/asm/cmpxchg_64.h6
-rw-r--r--arch/sparc/include/asm/mmu_context_64.h6
-rw-r--r--arch/sparc/include/asm/pgtable_32.h2
-rw-r--r--arch/sparc/include/asm/pgtable_64.h6
-rw-r--r--arch/sparc/include/asm/smp_64.h2
-rw-r--r--arch/sparc/include/asm/uaccess_64.h2
-rw-r--r--arch/sparc/kernel/process_64.c2
-rw-r--r--arch/sparc/kernel/signal32.c2
-rw-r--r--arch/sparc/kernel/signal_64.c2
-rw-r--r--arch/sparc/kernel/smp_32.c2
-rw-r--r--arch/sparc/kernel/smp_64.c2
-rw-r--r--arch/sparc/kernel/uprobes.c7
-rw-r--r--arch/um/include/asm/pgtable.h2
-rw-r--r--arch/x86/Kconfig35
-rw-r--r--arch/x86/Kconfig.assembler5
-rw-r--r--arch/x86/boot/compressed/tdx.c4
-rw-r--r--arch/x86/coco/core.c53
-rw-r--r--arch/x86/coco/tdx/tdcall.S66
-rw-r--r--arch/x86/coco/tdx/tdx.c18
-rw-r--r--arch/x86/entry/entry_64.S28
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/Makefile5
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c11
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--arch/x86/events/intel/core.c16
-rw-r--r--arch/x86/events/intel/cstate.c2
-rw-r--r--arch/x86/events/msr.c2
-rw-r--r--arch/x86/hyperv/hv_init.c11
-rw-r--r--arch/x86/hyperv/ivm.c142
-rw-r--r--arch/x86/include/asm/coco.h24
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/disabled-features.h24
-rw-r--r--arch/x86/include/asm/fpu/api.h9
-rw-r--r--arch/x86/include/asm/fpu/regset.h7
-rw-r--r--arch/x86/include/asm/fpu/sched.h3
-rw-r--r--arch/x86/include/asm/fpu/types.h16
-rw-r--r--arch/x86/include/asm/fpu/xstate.h6
-rw-r--r--arch/x86/include/asm/idtentry.h2
-rw-r--r--arch/x86/include/asm/intel-mid.h21
-rw-r--r--arch/x86/include/asm/mem_encrypt.h1
-rw-r--r--arch/x86/include/asm/mmu.h18
-rw-r--r--arch/x86/include/asm/mmu_context.h62
-rw-r--r--arch/x86/include/asm/mshyperv.h16
-rw-r--r--arch/x86/include/asm/nospec-branch.h14
-rw-r--r--arch/x86/include/asm/orc_types.h12
-rw-r--r--arch/x86/include/asm/paravirt.h14
-rw-r--r--arch/x86/include/asm/paravirt_types.h15
-rw-r--r--arch/x86/include/asm/pgtable.h322
-rw-r--r--arch/x86/include/asm/pgtable_types.h56
-rw-r--r--arch/x86/include/asm/processor-flags.h2
-rw-r--r--arch/x86/include/asm/processor.h14
-rw-r--r--arch/x86/include/asm/realmode.h1
-rw-r--r--arch/x86/include/asm/sev-common.h4
-rw-r--r--arch/x86/include/asm/sev.h10
-rw-r--r--arch/x86/include/asm/shared/tdx.h5
-rw-r--r--arch/x86/include/asm/shstk.h38
-rw-r--r--arch/x86/include/asm/smp.h10
-rw-r--r--arch/x86/include/asm/special_insns.h13
-rw-r--r--arch/x86/include/asm/tlbflush.h51
-rw-r--r--arch/x86/include/asm/trap_pf.h2
-rw-r--r--arch/x86/include/asm/traps.h12
-rw-r--r--arch/x86/include/asm/uaccess.h58
-rw-r--r--arch/x86/include/asm/uaccess_64.h2
-rw-r--r--arch/x86/include/asm/unwind_hints.h18
-rw-r--r--arch/x86/include/asm/x86_init.h4
-rw-r--r--arch/x86/include/uapi/asm/mman.h4
-rw-r--r--arch/x86/include/uapi/asm/prctl.h20
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h6
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/boot.c11
-rw-r--r--arch/x86/kernel/acpi/sleep.c23
-rw-r--r--arch/x86/kernel/apic/apic.c5
-rw-r--r--arch/x86/kernel/apic/io_apic.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c126
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/cet.c152
-rw-r--r--arch/x86/kernel/cpu/amd.c11
-rw-r--r--arch/x86/kernel/cpu/bugs.c10
-rw-r--r--arch/x86/kernel/cpu/common.c36
-rw-r--r--arch/x86/kernel/cpu/cpu.h8
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c59
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c16
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h10
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c9
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c15
-rw-r--r--arch/x86/kernel/cpu/proc.c23
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c43
-rw-r--r--arch/x86/kernel/fpu/core.c54
-rw-r--r--arch/x86/kernel/fpu/regset.c81
-rw-r--r--arch/x86/kernel/fpu/xstate.c90
-rw-r--r--arch/x86/kernel/ftrace_64.S2
-rw-r--r--arch/x86/kernel/head_64.S89
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/itmt.c11
-rw-r--r--arch/x86/kernel/paravirt.c30
-rw-r--r--arch/x86/kernel/process.c32
-rw-r--r--arch/x86/kernel/process_64.c75
-rw-r--r--arch/x86/kernel/ptrace.c12
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S10
-rw-r--r--arch/x86/kernel/sev.c15
-rw-r--r--arch/x86/kernel/shstk.c499
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/signal_32.c2
-rw-r--r--arch/x86/kernel/signal_64.c8
-rw-r--r--arch/x86/kernel/smpboot.c30
-rw-r--r--arch/x86/kernel/sys_x86_64.c6
-rw-r--r--arch/x86/kernel/traps.c93
-rw-r--r--arch/x86/kernel/unwind_orc.c27
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/svm/svm.c4
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/lib/getuser.S83
-rw-r--r--arch/x86/lib/putuser.S54
-rw-r--r--arch/x86/lib/retpoline.S6
-rw-r--r--arch/x86/lib/usercopy_64.c9
-rw-r--r--arch/x86/mm/fault.c22
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/ioremap.c5
-rw-r--r--arch/x86/mm/mem_encrypt_amd.c10
-rw-r--r--arch/x86/mm/pat/set_memory.c7
-rw-r--r--arch/x86/mm/pgtable.c38
-rw-r--r--arch/x86/mm/tlb.c53
-rw-r--r--arch/x86/platform/pvh/head.S2
-rw-r--r--arch/x86/xen/enlighten_pv.c2
-rw-r--r--arch/x86/xen/mmu_pv.c14
-rw-r--r--arch/x86/xen/xen-asm.S6
-rw-r--r--arch/x86/xen/xen-head.S6
-rw-r--r--arch/xtensa/include/asm/cmpxchg.h4
-rw-r--r--arch/xtensa/include/asm/pgtable.h2
-rw-r--r--arch/xtensa/include/asm/smp.h2
-rw-r--r--arch/xtensa/kernel/smp.c6
-rw-r--r--drivers/crypto/ccp/sev-dev.c22
-rw-r--r--drivers/gpu/drm/i915/gt/intel_engine_cs.c2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c4
-rw-r--r--drivers/gpu/drm/i915/gt/intel_execlists_submission.c4
-rw-r--r--drivers/gpu/drm/i915/gt/intel_ggtt.c4
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gsc.c2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt.c4
-rw-r--r--drivers/gpu/drm/i915/gt/intel_gt_pm.c2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_lrc.c6
-rw-r--r--drivers/gpu/drm/i915/gt/intel_migrate.c2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_rc6.c2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_rps.c2
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_context.c2
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_ring_submission.c2
-rw-r--r--drivers/gpu/drm/i915/gt/selftest_timeline.c2
-rw-r--r--drivers/gpu/drm/i915/gt/uc/intel_uc.c2
-rw-r--r--drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c2
-rw-r--r--drivers/gpu/drm/i915/i915_utils.h1
-rw-r--r--drivers/hv/ring_buffer.c2
-rw-r--r--drivers/hv/vmbus_drv.c1
-rw-r--r--drivers/iommu/iommu-sva.c12
-rw-r--r--drivers/vfio/vfio_iommu_type1.c2
-rw-r--r--drivers/vhost/vhost.c3
-rw-r--r--drivers/virt/coco/sev-guest/sev-guest.c99
-rw-r--r--fs/aio.c2
-rw-r--r--fs/proc/array.c13
-rw-r--r--fs/proc/task_mmu.c12
-rw-r--r--include/asm-generic/hugetlb.h4
-rw-r--r--include/asm-generic/mshyperv.h2
-rw-r--r--include/linux/cpu.h2
-rw-r--r--include/linux/iommu.h10
-rw-r--r--include/linux/livepatch.h1
-rw-r--r--include/linux/livepatch_sched.h29
-rw-r--r--include/linux/llist.h6
-rw-r--r--include/linux/mm.h76
-rw-r--r--include/linux/mman.h4
-rw-r--r--include/linux/mmu_context.h14
-rw-r--r--include/linux/non-atomic/xchg.h19
-rw-r--r--include/linux/objtool.h81
-rw-r--r--include/linux/objtool_types.h57
-rw-r--r--include/linux/pgtable.h14
-rw-r--r--include/linux/proc_fs.h2
-rw-r--r--include/linux/sched.h20
-rw-r--r--include/linux/smp.h11
-rw-r--r--include/linux/syscalls.h1
-rw-r--r--include/linux/uaccess.h22
-rw-r--r--include/trace/events/ipi.h44
-rw-r--r--include/uapi/asm-generic/siginfo.h3
-rw-r--r--include/uapi/asm-generic/unistd.h2
-rw-r--r--include/uapi/linux/elf.h2
-rw-r--r--include/uapi/linux/psp-sev.h7
-rw-r--r--include/uapi/linux/sev-guest.h18
-rw-r--r--init/main.c19
-rw-r--r--ipc/shm.c2
-rw-r--r--kernel/irq_work.c12
-rw-r--r--kernel/livepatch/core.c1
-rw-r--r--kernel/livepatch/transition.c122
-rw-r--r--kernel/locking/rwbase_rt.c9
-rw-r--r--kernel/sched/core.c168
-rw-r--r--kernel/sched/deadline.c10
-rw-r--r--kernel/sched/fair.c20
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/rt.c19
-rw-r--r--kernel/sched/sched.h4
-rw-r--r--kernel/sched/smp.h2
-rw-r--r--kernel/sched/topology.c4
-rw-r--r--kernel/smp.c313
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--lib/vdso/Makefile13
-rw-r--r--mm/debug_vm_pgtable.c16
-rw-r--r--mm/gup.c6
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/internal.h4
-rw-r--r--mm/madvise.c5
-rw-r--r--mm/memory.c5
-rw-r--r--mm/migrate.c11
-rw-r--r--mm/migrate_device.c2
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/userfaultfd.c2
-rw-r--r--mm/util.c2
-rw-r--r--scripts/sorttable.h2
-rw-r--r--tools/arch/x86/include/asm/orc_types.h12
-rw-r--r--tools/arch/x86/kcpuid/cpuid.csv61
-rw-r--r--tools/arch/x86/kcpuid/kcpuid.c32
-rw-r--r--tools/include/linux/objtool.h200
-rw-r--r--tools/include/linux/objtool_types.h57
-rw-r--r--tools/objtool/check.c70
-rw-r--r--tools/objtool/include/objtool/check.h4
-rw-r--r--tools/objtool/orc_dump.c15
-rw-r--r--tools/objtool/orc_gen.c43
-rwxr-xr-xtools/objtool/sync-check.sh2
-rw-r--r--tools/testing/selftests/x86/Makefile2
-rw-r--r--tools/testing/selftests/x86/lam.c1241
-rw-r--r--tools/testing/selftests/x86/test_shadow_stack.c695
-rw-r--r--virt/kvm/kvm_main.c3
346 files changed, 6245 insertions, 1923 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9b70773fdcd8..6a6ede750c5c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -912,15 +912,14 @@
cs89x0_media= [HW,NET]
Format: { rj45 | aui | bnc }
- csdlock_debug= [KNL] Enable debug add-ons of cross-CPU function call
- handling. When switched on, additional debug data is
- printed to the console in case a hanging CPU is
- detected, and that CPU is pinged again in order to try
- to resolve the hang situation.
- 0: disable csdlock debugging (default)
- 1: enable basic csdlock debugging (minor impact)
- ext: enable extended csdlock debugging (more impact,
- but more data)
+ csdlock_debug= [KNL] Enable or disable debug add-ons of cross-CPU
+ function call handling. When switched on,
+ additional debug data is printed to the console
+ in case a hanging CPU is detected, and that
+ CPU is pinged again in order to try to resolve
+ the hang situation. The default value of this
+ option depends on the CSD_LOCK_WAIT_DEBUG_DEFAULT
+ Kconfig option.
dasd= [HW,NET]
See header of drivers/s390/block/dasd_devmap.c.
diff --git a/Documentation/arch/x86/index.rst b/Documentation/arch/x86/index.rst
index c73d133fd37c..8ac64d7de4dc 100644
--- a/Documentation/arch/x86/index.rst
+++ b/Documentation/arch/x86/index.rst
@@ -22,6 +22,7 @@ x86-specific Documentation
mtrr
pat
intel-hfi
+ shstk
iommu
intel_txt
amd-memory-encryption
diff --git a/Documentation/arch/x86/microcode.rst b/Documentation/arch/x86/microcode.rst
index b627c6f36bcf..15b52e2b181d 100644
--- a/Documentation/arch/x86/microcode.rst
+++ b/Documentation/arch/x86/microcode.rst
@@ -208,6 +208,16 @@ Basically there is no way to declare a new microcode update suitable
for late-loading. This is another one of the problems that caused late
loading to be not enabled by default.
+AMD
+---
+
+Late loading on AMD does not have the concurrency issues described
+above: when loading is attempted on T0, the T1 is quiesced and does not
+execute instructions. Therefore, even if a higher priority interrupt or
+a fault happens, the whole core will see it either before the microcode
+patch has been applied or after. In either case, T0 and T1 will have the
+same microcode revision and nothing intermediate.
+
Builtin microcode
=================
diff --git a/Documentation/arch/x86/shstk.rst b/Documentation/arch/x86/shstk.rst
new file mode 100644
index 000000000000..60260e809baf
--- /dev/null
+++ b/Documentation/arch/x86/shstk.rst
@@ -0,0 +1,179 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================================================
+Control-flow Enforcement Technology (CET) Shadow Stack
+======================================================
+
+CET Background
+==============
+
+Control-flow Enforcement Technology (CET) covers several related x86 processor
+features that provide protection against control flow hijacking attacks. CET
+can protect both applications and the kernel.
+
+CET introduces shadow stack and indirect branch tracking (IBT). A shadow stack
+is a secondary stack allocated from memory which cannot be directly modified by
+applications. When executing a CALL instruction, the processor pushes the
+return address to both the normal stack and the shadow stack. Upon
+function return, the processor pops the shadow stack copy and compares it
+to the normal stack copy. If the two differ, the processor raises a
+control-protection fault. IBT verifies indirect CALL/JMP targets are intended
+as marked by the compiler with 'ENDBR' opcodes. Not all CPU's have both Shadow
+Stack and Indirect Branch Tracking. Today in the 64-bit kernel, only userspace
+shadow stack and kernel IBT are supported.
+
+Requirements to use Shadow Stack
+================================
+
+To use userspace shadow stack you need HW that supports it, a kernel
+configured with it and userspace libraries compiled with it.
+
+The kernel Kconfig option is X86_USER_SHADOW_STACK. When compiled in, shadow
+stacks can be disabled at runtime with the kernel parameter: nousershstk.
+
+To build a user shadow stack enabled kernel, Binutils v2.29 or LLVM v6 or later
+are required.
+
+At run time, /proc/cpuinfo shows CET features if the processor supports
+CET. "user_shstk" means that userspace shadow stack is supported on the current
+kernel and HW.
+
+Application Enabling
+====================
+
+An application's CET capability is marked in its ELF note and can be verified
+from readelf/llvm-readelf output::
+
+ readelf -n <application> | grep -a SHSTK
+ properties: x86 feature: SHSTK
+
+The kernel does not process these applications markers directly. Applications
+or loaders must enable CET features using the interface described in section 4.
+Typically this would be done in dynamic loader or static runtime objects, as is
+the case in GLIBC.
+
+Enabling arch_prctl()'s
+=======================
+
+Elf features should be enabled by the loader using the below arch_prctl's. They
+are only supported in 64 bit user applications. These operate on the features
+on a per-thread basis. The enablement status is inherited on clone, so if the
+feature is enabled on the first thread, it will propagate to all the thread's
+in an app.
+
+arch_prctl(ARCH_SHSTK_ENABLE, unsigned long feature)
+ Enable a single feature specified in 'feature'. Can only operate on
+ one feature at a time.
+
+arch_prctl(ARCH_SHSTK_DISABLE, unsigned long feature)
+ Disable a single feature specified in 'feature'. Can only operate on
+ one feature at a time.
+
+arch_prctl(ARCH_SHSTK_LOCK, unsigned long features)
+ Lock in features at their current enabled or disabled status. 'features'
+ is a mask of all features to lock. All bits set are processed, unset bits
+ are ignored. The mask is ORed with the existing value. So any feature bits
+ set here cannot be enabled or disabled afterwards.
+
+arch_prctl(ARCH_SHSTK_UNLOCK, unsigned long features)
+ Unlock features. 'features' is a mask of all features to unlock. All
+ bits set are processed, unset bits are ignored. Only works via ptrace.
+
+arch_prctl(ARCH_SHSTK_STATUS, unsigned long addr)
+ Copy the currently enabled features to the address passed in addr. The
+ features are described using the bits passed into the others in
+ 'features'.
+
+The return values are as follows. On success, return 0. On error, errno can
+be::
+
+ -EPERM if any of the passed feature are locked.
+ -ENOTSUPP if the feature is not supported by the hardware or
+ kernel.
+ -EINVAL arguments (non existing feature, etc)
+ -EFAULT if could not copy information back to userspace
+
+The feature's bits supported are::
+
+ ARCH_SHSTK_SHSTK - Shadow stack
+ ARCH_SHSTK_WRSS - WRSS
+
+Currently shadow stack and WRSS are supported via this interface. WRSS
+can only be enabled with shadow stack, and is automatically disabled
+if shadow stack is disabled.
+
+Proc Status
+===========
+To check if an application is actually running with shadow stack, the
+user can read the /proc/$PID/status. It will report "wrss" or "shstk"
+depending on what is enabled. The lines look like this::
+
+ x86_Thread_features: shstk wrss
+ x86_Thread_features_locked: shstk wrss
+
+Implementation of the Shadow Stack
+==================================
+
+Shadow Stack Size
+-----------------
+
+A task's shadow stack is allocated from memory to a fixed size of
+MIN(RLIMIT_STACK, 4 GB). In other words, the shadow stack is allocated to
+the maximum size of the normal stack, but capped to 4 GB. In the case
+of the clone3 syscall, there is a stack size passed in and shadow stack
+uses this instead of the rlimit.
+
+Signal
+------
+
+The main program and its signal handlers use the same shadow stack. Because
+the shadow stack stores only return addresses, a large shadow stack covers
+the condition that both the program stack and the signal alternate stack run
+out.
+
+When a signal happens, the old pre-signal state is pushed on the stack. When
+shadow stack is enabled, the shadow stack specific state is pushed onto the
+shadow stack. Today this is only the old SSP (shadow stack pointer), pushed
+in a special format with bit 63 set. On sigreturn this old SSP token is
+verified and restored by the kernel. The kernel will also push the normal
+restorer address to the shadow stack to help userspace avoid a shadow stack
+violation on the sigreturn path that goes through the restorer.
+
+So the shadow stack signal frame format is as follows::
+
+ |1...old SSP| - Pointer to old pre-signal ssp in sigframe token format
+ (bit 63 set to 1)
+ | ...| - Other state may be added in the future
+
+
+32 bit ABI signals are not supported in shadow stack processes. Linux prevents
+32 bit execution while shadow stack is enabled by the allocating shadow stacks
+outside of the 32 bit address space. When execution enters 32 bit mode, either
+via far call or returning to userspace, a #GP is generated by the hardware
+which, will be delivered to the process as a segfault. When transitioning to
+userspace the register's state will be as if the userspace ip being returned to
+caused the segfault.
+
+Fork
+----
+
+The shadow stack's vma has VM_SHADOW_STACK flag set; its PTEs are required
+to be read-only and dirty. When a shadow stack PTE is not RO and dirty, a
+shadow access triggers a page fault with the shadow stack access bit set
+in the page fault error code.
+
+When a task forks a child, its shadow stack PTEs are copied and both the
+parent's and the child's shadow stack PTEs are cleared of the dirty bit.
+Upon the next shadow stack access, the resulting shadow stack page fault
+is handled by page copy/re-use.
+
+When a pthread child is created, the kernel allocates a new shadow stack
+for the new thread. New shadow stack creation behaves like mmap() with respect
+to ASLR behavior. Similarly, on thread exit the thread's shadow stack is
+disabled.
+
+Exec
+----
+
+On exec, shadow stack features are disabled by the kernel. At which point,
+userspace can choose to re-enable, or lock them.
diff --git a/Documentation/arch/x86/xstate.rst b/Documentation/arch/x86/xstate.rst
index 5cec7fb558d6..ae5c69e48b11 100644
--- a/Documentation/arch/x86/xstate.rst
+++ b/Documentation/arch/x86/xstate.rst
@@ -11,6 +11,22 @@ are enabled by XCR0 as well, but the first use of related instruction is
trapped by the kernel because by default the required large XSTATE buffers
are not allocated automatically.
+The purpose for dynamic features
+--------------------------------
+
+Legacy userspace libraries often have hard-coded, static sizes for
+alternate signal stacks, often using MINSIGSTKSZ which is typically 2KB.
+That stack must be able to store at *least* the signal frame that the
+kernel sets up before jumping into the signal handler. That signal frame
+must include an XSAVE buffer defined by the CPU.
+
+However, that means that the size of signal stacks is dynamic, not static,
+because different CPUs have differently-sized XSAVE buffers. A compiled-in
+size of 2KB with existing applications is too small for new CPU features
+like AMX. Instead of universally requiring larger stack, with the dynamic
+enabling, the kernel can enforce userspace applications to have
+properly-sized altstacks.
+
Using dynamically enabled XSTATE features in user space applications
--------------------------------------------------------------------
@@ -64,6 +80,61 @@ the handler allocates a larger xstate buffer for the task so the large
state can be context switched. In the unlikely cases that the allocation
fails, the kernel sends SIGSEGV.
+AMX TILE_DATA enabling example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Below is the example of how userspace applications enable
+TILE_DATA dynamically:
+
+ 1. The application first needs to query the kernel for AMX
+ support::
+
+ #include <asm/prctl.h>
+ #include <sys/syscall.h>
+ #include <stdio.h>
+ #include <unistd.h>
+
+ #ifndef ARCH_GET_XCOMP_SUPP
+ #define ARCH_GET_XCOMP_SUPP 0x1021
+ #endif
+
+ #ifndef ARCH_XCOMP_TILECFG
+ #define ARCH_XCOMP_TILECFG 17
+ #endif
+
+ #ifndef ARCH_XCOMP_TILEDATA
+ #define ARCH_XCOMP_TILEDATA 18
+ #endif
+
+ #define MASK_XCOMP_TILE ((1 << ARCH_XCOMP_TILECFG) | \
+ (1 << ARCH_XCOMP_TILEDATA))
+
+ unsigned long features;
+ long rc;
+
+ ...
+
+ rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features);
+
+ if (!rc && (features & MASK_XCOMP_TILE) == MASK_XCOMP_TILE)
+ printf("AMX is available.\n");
+
+ 2. After that, determining support for AMX, an application must
+ explicitly ask permission to use it::
+
+ #ifndef ARCH_REQ_XCOMP_PERM
+ #define ARCH_REQ_XCOMP_PERM 0x1023
+ #endif
+
+ ...
+
+ rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, ARCH_XCOMP_TILEDATA);
+
+ if (!rc)
+ printf("AMX is ready for use.\n");
+
+Note this example does not include the sigaltstack preparation.
+
Dynamic features in signal frames
---------------------------------
@@ -72,3 +143,32 @@ entry if the feature is in its initial configuration. This differs from
non-dynamic features which are always written regardless of their
configuration. Signal handlers can examine the XSAVE buffer's XSTATE_BV
field to determine if a features was written.
+
+Dynamic features for virtual machines
+-------------------------------------
+
+The permission for the guest state component needs to be managed separately
+from the host, as they are exclusive to each other. A coupled of options
+are extended to control the guest permission:
+
+-ARCH_GET_XCOMP_GUEST_PERM
+
+ arch_prctl(ARCH_GET_XCOMP_GUEST_PERM, &features);
+
+ ARCH_GET_XCOMP_GUEST_PERM is a variant of ARCH_GET_XCOMP_PERM. So it
+ provides the same semantics and functionality but for the guest
+ components.
+
+-ARCH_REQ_XCOMP_GUEST_PERM
+
+ arch_prctl(ARCH_REQ_XCOMP_GUEST_PERM, feature_nr);
+
+ ARCH_REQ_XCOMP_GUEST_PERM is a variant of ARCH_REQ_XCOMP_PERM. It has the
+ same semantics for the guest permission. While providing a similar
+ functionality, this comes with a constraint. Permission is frozen when the
+ first VCPU is created. Any attempt to change permission after that point
+ is going to be rejected. So, the permission has to be requested before the
+ first VCPU creation.
+
+Note that some VMMs may have already established a set of supported state
+components. These options are not presumed to support any particular VMM.
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 8e02ebe093ca..bfefcbb8f82b 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -564,6 +564,7 @@ encoded manner. The codes are the following:
mt arm64 MTE allocation tags are enabled
um userfaultfd missing tracking
uw userfaultfd wr-protect tracking
+ ss shadow stack page
== =======================================
Note that there is no guarantee that every flag and associated mnemonic will
diff --git a/Documentation/livepatch/reliable-stacktrace.rst b/Documentation/livepatch/reliable-stacktrace.rst
index 67459d2ca2af..d56bb706172f 100644
--- a/Documentation/livepatch/reliable-stacktrace.rst
+++ b/Documentation/livepatch/reliable-stacktrace.rst
@@ -183,7 +183,7 @@ trampoline or return trampoline. For example, considering the x86_64
.. code-block:: none
SYM_CODE_START(return_to_handler)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
subq $24, %rsp
/* Save the return values */
diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst
index af3891f895b0..c19a93c44e4c 100644
--- a/Documentation/mm/arch_pgtable_helpers.rst
+++ b/Documentation/mm/arch_pgtable_helpers.rst
@@ -46,7 +46,8 @@ PTE Page Table Helpers
+---------------------------+--------------------------------------------------+
| pte_mkclean | Creates a clean PTE |
+---------------------------+--------------------------------------------------+
-| pte_mkwrite | Creates a writable PTE |
+| pte_mkwrite | Creates a writable PTE of the type specified by |
+| | the VMA. |
+---------------------------+--------------------------------------------------+
| pte_wrprotect | Creates a write protected PTE |
+---------------------------+--------------------------------------------------+
@@ -118,7 +119,8 @@ PMD Page Table Helpers
+---------------------------+--------------------------------------------------+
| pmd_mkclean | Creates a clean PMD |
+---------------------------+--------------------------------------------------+
-| pmd_mkwrite | Creates a writable PMD |
+| pmd_mkwrite | Creates a writable PMD of the type specified by |
+| | the VMA. |
+---------------------------+--------------------------------------------------+
| pmd_wrprotect | Creates a write protected PMD |
+---------------------------+--------------------------------------------------+
@@ -222,7 +224,8 @@ HugeTLB Page Table Helpers
+---------------------------+--------------------------------------------------+
| huge_pte_mkdirty | Creates a dirty HugeTLB |
+---------------------------+--------------------------------------------------+
-| huge_pte_mkwrite | Creates a writable HugeTLB |
+| huge_pte_mkwrite | Creates a writable HugeTLB of the type specified |
+| | by the VMA. |
+---------------------------+--------------------------------------------------+
| huge_pte_wrprotect | Creates a write protected HugeTLB |
+---------------------------+--------------------------------------------------+
diff --git a/Documentation/virt/coco/sev-guest.rst b/Documentation/virt/coco/sev-guest.rst
index bf593e88cfd9..68b0d2363af8 100644
--- a/Documentation/virt/coco/sev-guest.rst
+++ b/Documentation/virt/coco/sev-guest.rst
@@ -37,11 +37,11 @@ along with a description:
the return value. General error numbers (-ENOMEM, -EINVAL)
are not detailed, but errors with specific meanings are.
-The guest ioctl should be issued on a file descriptor of the /dev/sev-guest device.
-The ioctl accepts struct snp_user_guest_request. The input and output structure is
-specified through the req_data and resp_data field respectively. If the ioctl fails
-to execute due to a firmware error, then fw_err code will be set otherwise the
-fw_err will be set to 0x00000000000000ff.
+The guest ioctl should be issued on a file descriptor of the /dev/sev-guest
+device. The ioctl accepts struct snp_user_guest_request. The input and
+output structure is specified through the req_data and resp_data field
+respectively. If the ioctl fails to execute due to a firmware error, then
+the fw_error code will be set, otherwise fw_error will be set to -1.
The firmware checks that the message sequence counter is one greater than
the guests message sequence counter. If guest driver fails to increment message
@@ -57,8 +57,14 @@ counter (e.g. counter overflow), then -EIO will be returned.
__u64 req_data;
__u64 resp_data;
- /* firmware error code on failure (see psp-sev.h) */
- __u64 fw_err;
+ /* bits[63:32]: VMM error code, bits[31:0] firmware error code (see psp-sev.h) */
+ union {
+ __u64 exitinfo2;
+ struct {
+ __u32 fw_error;
+ __u32 vmm_error;
+ };
+ };
};
2.1 SNP_GET_REPORT
diff --git a/MAINTAINERS b/MAINTAINERS
index 0623fdfdbcdf..49321684cd8e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15161,8 +15161,8 @@ OBJTOOL
M: Josh Poimboeuf <jpoimboe@kernel.org>
M: Peter Zijlstra <peterz@infradead.org>
S: Supported
+F: include/linux/objtool*.h
F: tools/objtool/
-F: include/linux/objtool.h
OCELOT ETHERNET SWITCH DRIVER
M: Vladimir Oltean <vladimir.oltean@nxp.com>
@@ -22766,6 +22766,17 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/asm
F: arch/x86/entry/
+X86 HARDWARE VULNERABILITIES
+M: Thomas Gleixner <tglx@linutronix.de>
+M: Borislav Petkov <bp@alien8.de>
+M: Peter Zijlstra <peterz@infradead.org>
+M: Josh Poimboeuf <jpoimboe@kernel.org>
+R: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
+S: Maintained
+F: Documentation/admin-guide/hw-vuln/
+F: arch/x86/include/asm/nospec-branch.h
+F: arch/x86/kernel/cpu/bugs.c
+
X86 MCE INFRASTRUCTURE
M: Tony Luck <tony.luck@intel.com>
M: Borislav Petkov <bp@alien8.de>
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h
index 6e0a850aa9d3..91d4a4d9258c 100644
--- a/arch/alpha/include/asm/cmpxchg.h
+++ b/arch/alpha/include/asm/cmpxchg.h
@@ -6,15 +6,15 @@
* Atomic exchange routines.
*/
-#define ____xchg(type, args...) __xchg ## type ## _local(args)
+#define ____xchg(type, args...) __arch_xchg ## type ## _local(args)
#define ____cmpxchg(type, args...) __cmpxchg ## type ## _local(args)
#include <asm/xchg.h>
#define xchg_local(ptr, x) \
({ \
__typeof__(*(ptr)) _x_ = (x); \
- (__typeof__(*(ptr))) __xchg_local((ptr), (unsigned long)_x_, \
- sizeof(*(ptr))); \
+ (__typeof__(*(ptr))) __arch_xchg_local((ptr), (unsigned long)_x_,\
+ sizeof(*(ptr))); \
})
#define arch_cmpxchg_local(ptr, o, n) \
@@ -34,7 +34,7 @@
#undef ____xchg
#undef ____cmpxchg
-#define ____xchg(type, args...) __xchg ##type(args)
+#define ____xchg(type, args...) __arch_xchg ##type(args)
#define ____cmpxchg(type, args...) __cmpxchg ##type(args)
#include <asm/xchg.h>
@@ -48,7 +48,7 @@
__typeof__(*(ptr)) _x_ = (x); \
smp_mb(); \
__ret = (__typeof__(*(ptr))) \
- __xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \
+ __arch_xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \
smp_mb(); \
__ret; \
})
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index ba43cb841d19..fb5d207c2a89 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -256,9 +256,13 @@ extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;
extern inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_FOW; return pte; }
extern inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~(__DIRTY_BITS); return pte; }
extern inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~(__ACCESS_BITS); return pte; }
-extern inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) &= ~_PAGE_FOW; return pte; }
extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; return pte; }
extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; }
+extern inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ pte_val(pte) &= ~_PAGE_FOW;
+ return pte;
+}
/*
* The smp_rmb() in the following functions are required to order the load of
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index e9cf7193eb81..582d96548385 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -9,6 +9,7 @@
* This file handles the architecture-dependent parts of process handling.
*/
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -59,9 +60,10 @@ void arch_cpu_idle(void)
wtint(0);
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
wtint(INT_MAX);
+ BUG();
}
#endif /* ALPHA_WTINT */
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 0ede4b044e86..7439b2377df5 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -562,7 +562,7 @@ handle_ipi(struct pt_regs *regs)
}
void
-smp_send_reschedule(int cpu)
+arch_smp_send_reschedule(int cpu)
{
#ifdef DEBUG_IPI_MSG
if (cpu == hard_smp_processor_id())
diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h
index c5b544a5fe81..e138fde067de 100644
--- a/arch/arc/include/asm/cmpxchg.h
+++ b/arch/arc/include/asm/cmpxchg.h
@@ -85,7 +85,7 @@
*/
#ifdef CONFIG_ARC_HAS_LLSC
-#define __xchg(ptr, val) \
+#define __arch_xchg(ptr, val) \
({ \
__asm__ __volatile__( \
" ex %0, [%1] \n" /* set new value */ \
@@ -102,7 +102,7 @@
\
switch(sizeof(*(_p_))) { \
case 4: \
- _val_ = __xchg(_p_, _val_); \
+ _val_ = __arch_xchg(_p_, _val_); \
break; \
default: \
BUILD_BUG(); \
diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h
index 5001b796fb8d..223a96967188 100644
--- a/arch/arc/include/asm/hugepage.h
+++ b/arch/arc/include/asm/hugepage.h
@@ -21,7 +21,7 @@ static inline pmd_t pte_pmd(pte_t pte)
}
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd, vma) pte_pmd(pte_mkwrite(pmd_pte(pmd), (vma)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 6e9f8ca6d6a1..a5b8bc955015 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -87,7 +87,6 @@
PTE_BIT_FUNC(mknotpresent, &= ~(_PAGE_PRESENT));
PTE_BIT_FUNC(wrprotect, &= ~(_PAGE_WRITE));
-PTE_BIT_FUNC(mkwrite, |= (_PAGE_WRITE));
PTE_BIT_FUNC(mkclean, &= ~(_PAGE_DIRTY));
PTE_BIT_FUNC(mkdirty, |= (_PAGE_DIRTY));
PTE_BIT_FUNC(mkold, &= ~(_PAGE_ACCESSED));
@@ -95,6 +94,12 @@ PTE_BIT_FUNC(mkyoung, |= (_PAGE_ACCESSED));
PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL));
PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ));
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ pte_val(pte) |= (_PAGE_WRITE);
+ return pte;
+}
+
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index ad93fe6e4b77..409cfa4675b4 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -292,7 +292,7 @@ static void ipi_send_msg(const struct cpumask *callmap, enum ipi_msg_type msg)
ipi_send_msg_one(cpu, msg);
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
ipi_send_msg_one(cpu, IPI_RESCHEDULE);
}
diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
index 4dfe538dfc68..44667bdb4707 100644
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
@@ -25,7 +25,8 @@
#define swp_is_buggy
#endif
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size)
+static inline unsigned long
+__arch_xchg(unsigned long x, volatile void *ptr, int size)
{
extern void __bad_xchg(volatile void *, int);
unsigned long ret;
@@ -115,8 +116,8 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
}
#define arch_xchg_relaxed(ptr, x) ({ \
- (__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), \
- sizeof(*(ptr))); \
+ (__typeof__(*(ptr)))__arch_xchg((unsigned long)(x), (ptr), \
+ sizeof(*(ptr))); \
})
#include <asm-generic/cmpxchg-local.h>
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 106049791500..df071a807610 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -202,11 +202,16 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
PMD_BIT_FUNC(wrprotect, |= L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkold, &= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mkwrite, &= ~L_PMD_SECT_RDONLY);
PMD_BIT_FUNC(mkdirty, |= L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY);
PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF);
+static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ pmd_val(pmd) |= L_PMD_SECT_RDONLY;
+ return pmd;
+}
+
#define pmd_mkhuge(pmd) (__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
#define pmd_pfn(pmd) (((pmd_val(pmd) & PMD_MASK) & PHYS_MASK) >> PAGE_SHIFT)
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index a58ccbb406ad..39ad1ae1308d 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -227,7 +227,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
return set_pte_bit(pte, __pgprot(L_PTE_RDONLY));
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return clear_pte_bit(pte, __pgprot(L_PTE_RDONLY));
}
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index e07f359254c3..9a3c9de5ac5e 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -681,7 +681,7 @@ asmlinkage void do_rseq_syscall(struct pt_regs *regs)
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 0b8c25763adc..a8718b1a413b 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -48,7 +48,6 @@
#include <asm/mach/arch.h>
#include <asm/mpu.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/ipi.h>
/*
@@ -320,7 +319,7 @@ void __cpu_die(unsigned int cpu)
* of the other hotplug-cpu capable cores, so presumably coming
* out of idle fixes this.
*/
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
unsigned int cpu = smp_processor_id();
@@ -382,6 +381,8 @@ void arch_cpu_idle_dead(void)
: "r" (task_stack_page(current) + THREAD_SIZE - 8),
"r" (current)
: "r0");
+
+ unreachable();
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -747,7 +748,7 @@ void __init set_smp_ipi_range(int ipi_base, int n)
ipi_setup(smp_processor_id());
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
}
diff --git a/arch/arm/mach-actions/platsmp.c b/arch/arm/mach-actions/platsmp.c
index f26618b43514..7b208e96fbb6 100644
--- a/arch/arm/mach-actions/platsmp.c
+++ b/arch/arm/mach-actions/platsmp.c
@@ -20,6 +20,8 @@
#include <asm/smp_plat.h>
#include <asm/smp_scu.h>
+#include <trace/events/ipi.h>
+
#define OWL_CPU1_ADDR 0x50
#define OWL_CPU1_FLAG 0x5c
diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c
index f5f790c6e5f8..77ce8ae43137 100644
--- a/arch/arm/probes/uprobes/core.c
+++ b/arch/arm/probes/uprobes/core.c
@@ -9,6 +9,7 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/uprobes.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/notifier.h>
#include <asm/opcodes.h>
@@ -61,12 +62,7 @@ unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
struct pt_regs *regs)
{
- unsigned long orig_ret_vaddr;
-
- orig_ret_vaddr = regs->ARM_lr;
- /* Replace the return addr with trampoline addr */
- regs->ARM_lr = trampoline_vaddr;
- return orig_ret_vaddr;
+ return __xchg(&regs->ARM_lr, trampoline_vaddr);
}
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index a7ec06ce3785..515ca33b854c 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
hostprogs := vdsomunge
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 497acf134d99..c6bc5d8ec3ca 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -62,9 +62,8 @@ __XCHG_CASE( , , mb_, 64, dmb ish, nop, , a, l, "memory")
#undef __XCHG_CASE
#define __XCHG_GEN(sfx) \
-static __always_inline unsigned long __xchg##sfx(unsigned long x, \
- volatile void *ptr, \
- int size) \
+static __always_inline unsigned long \
+__arch_xchg##sfx(unsigned long x, volatile void *ptr, int size) \
{ \
switch (size) { \
case 1: \
@@ -93,7 +92,7 @@ __XCHG_GEN(_mb)
({ \
__typeof__(*(ptr)) __ret; \
__ret = (__typeof__(*(ptr))) \
- __xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \
+ __arch_xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \
__ret; \
})
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 72dbd6400549..56911691bef0 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -288,6 +288,12 @@ void post_ttbr_update_workaround(void);
unsigned long arm64_mm_context_get(struct mm_struct *mm);
void arm64_mm_context_put(struct mm_struct *mm);
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+ return -1UL >> 8;
+}
+
#include <asm-generic/mmu_context.h>
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0bd18de9fd97..932190079fa2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -180,13 +180,18 @@ static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot)
return pmd;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_kernel(pte_t pte)
{
pte = set_pte_bit(pte, __pgprot(PTE_WRITE));
pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
return pte;
}
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ return pte_mkwrite_kernel(pte);
+}
+
static inline pte_t pte_mkclean(pte_t pte)
{
pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
@@ -487,7 +492,7 @@ static inline int pmd_trans_huge(pmd_t pmd)
#define pmd_cont(pmd) pte_cont(pmd_pte(pmd))
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
-#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd, vma) pte_pmd(pte_mkwrite(pmd_pte(pmd), (vma)))
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index fc55f5a57a06..5733a31bab08 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -100,7 +100,7 @@ static inline void arch_send_wakeup_ipi_mask(const struct cpumask *mask)
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
-extern void cpu_die(void);
+extern void __noreturn cpu_die(void);
extern void cpu_die_early(void);
static inline void cpu_park_loop(void)
diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c
index d49aef2657cd..d7171d30ea66 100644
--- a/arch/arm64/kernel/probes/uprobes.c
+++ b/arch/arm64/kernel/probes/uprobes.c
@@ -3,6 +3,7 @@
* Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com>
*/
#include <linux/highmem.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <asm/cacheflush.h>
@@ -150,13 +151,7 @@ unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
struct pt_regs *regs)
{
- unsigned long orig_ret_vaddr;
-
- orig_ret_vaddr = procedure_link_pointer(regs);
- /* Replace the return addr with trampoline addr */
- procedure_link_pointer_set(regs, trampoline_vaddr);
-
- return orig_ret_vaddr;
+ return __xchg(&procedure_link_pointer(regs), trampoline_vaddr);
}
int arch_uprobe_exception_notify(struct notifier_block *self,
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 71d59b5abede..089ced6d6bd6 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -69,7 +69,7 @@ void (*pm_power_off)(void);
EXPORT_SYMBOL_GPL(pm_power_off);
#ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
cpu_die();
}
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 06a02707f488..19b6b292892c 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -1341,7 +1341,7 @@ void __init minsigstksz_setup(void)
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 4700f8522d27..bbd542704730 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -460,7 +460,7 @@ void compat_setup_restart_syscall(struct pt_regs *regs)
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 4e8327264255..6017c0cd9d2a 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -51,7 +51,6 @@
#include <asm/ptrace.h>
#include <asm/virt.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/ipi.h>
DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
@@ -361,7 +360,7 @@ void __cpu_die(unsigned int cpu)
* Called from the idle thread for the CPU which has been shutdown.
*
*/
-void cpu_die(void)
+void __noreturn cpu_die(void)
{
unsigned int cpu = smp_processor_id();
const struct cpu_operations *ops = get_cpu_ops(cpu);
@@ -977,7 +976,7 @@ void __init set_smp_ipi_range(int ipi_base, int n)
ipi_setup(smp_processor_id());
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
}
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index beaf9586338f..fe7a53c6781f 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -6,9 +6,7 @@
# Heavily based on the vDSO Makefiles for other archs.
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_AARCH64_JUMP_SLOT|R_AARCH64_GLOB_DAT|R_AARCH64_ABS64
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso := vgettimeofday.o note.o sigreturn.o
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index f59bd1a4ead6..d014162c5c71 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -3,9 +3,6 @@
# Makefile for vdso32
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_ARM_JUMP_SLOT|R_ARM_GLOB_DAT|R_ARM_ABS32
include $(srctree)/lib/vdso/Makefile
# Same as cc-*option, but using CC_COMPAT instead of CC
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 4ea2eefbc053..5c07e68d80ea 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -40,7 +40,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
* read only (code, rodata). Clear the RDONLY bit from
* the temporary mappings we use during restore.
*/
- set_pte(dst_ptep, pte_mkwrite(pte));
+ set_pte(dst_ptep, pte_mkwrite_kernel(pte));
} else if (debug_pagealloc_enabled() && !pte_none(pte)) {
/*
* debug_pagealloc will removed the PTE_VALID bit if
@@ -53,7 +53,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
*/
BUG_ON(!pfn_valid(pte_pfn(pte)));
- set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte)));
+ set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_kernel(pte)));
}
}
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index d4042495febc..c2f92c991e37 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -176,7 +176,7 @@ static inline pte_t pte_mkold(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= _PAGE_WRITE;
if (pte_val(pte) & _PAGE_MODIFIED)
diff --git a/arch/csky/kernel/probes/uprobes.c b/arch/csky/kernel/probes/uprobes.c
index 2d31a12e46cf..775fe88b5f00 100644
--- a/arch/csky/kernel/probes/uprobes.c
+++ b/arch/csky/kernel/probes/uprobes.c
@@ -3,6 +3,7 @@
* Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com>
*/
#include <linux/highmem.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <asm/cacheflush.h>
@@ -123,13 +124,7 @@ unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
struct pt_regs *regs)
{
- unsigned long ra;
-
- ra = regs->lr;
-
- regs->lr = trampoline_vaddr;
-
- return ra;
+ return __xchg(&regs->lr, trampoline_vaddr);
}
int arch_uprobe_exception_notify(struct notifier_block *self,
diff --git a/arch/csky/kernel/smp.c b/arch/csky/kernel/smp.c
index b45d1073307f..b12e2c3c387f 100644
--- a/arch/csky/kernel/smp.c
+++ b/arch/csky/kernel/smp.c
@@ -140,7 +140,7 @@ void smp_send_stop(void)
on_each_cpu(ipi_stop, NULL, 1);
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);
}
@@ -300,7 +300,7 @@ void __cpu_die(unsigned int cpu)
pr_notice("CPU%u: shutdown\n", cpu);
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
idle_task_exit();
@@ -317,5 +317,7 @@ void arch_cpu_idle_dead(void)
"jmpi csky_start_secondary"
:
: "r" (secondary_stack));
+
+ BUG();
}
#endif
diff --git a/arch/csky/kernel/vdso/Makefile b/arch/csky/kernel/vdso/Makefile
index 0b6909f10667..299e4e41ebc5 100644
--- a/arch/csky/kernel/vdso/Makefile
+++ b/arch/csky/kernel/vdso/Makefile
@@ -1,8 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_CKCORE_ADDR32|R_CKCORE_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Symbols present in the vdso
diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h
index cdb705e1496a..bf6cf5579cf4 100644
--- a/arch/hexagon/include/asm/cmpxchg.h
+++ b/arch/hexagon/include/asm/cmpxchg.h
@@ -9,7 +9,7 @@
#define _ASM_CMPXCHG_H
/*
- * __xchg - atomically exchange a register and a memory location
+ * __arch_xchg - atomically exchange a register and a memory location
* @x: value to swap
* @ptr: pointer to memory
* @size: size of the value
@@ -19,8 +19,8 @@
* Note: there was an errata for V2 about .new's and memw_locked.
*
*/
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
- int size)
+static inline unsigned long
+__arch_xchg(unsigned long x, volatile void *ptr, int size)
{
unsigned long retval;
@@ -42,8 +42,8 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
* Atomically swap the contents of a register with memory. Should be atomic
* between multiple CPU's and within interrupts on the same CPU.
*/
-#define arch_xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), \
- sizeof(*(ptr))))
+#define arch_xchg(ptr, v) ((__typeof__(*(ptr)))__arch_xchg((unsigned long)(v), (ptr), \
+ sizeof(*(ptr))))
/*
* see rt-mutex-design.txt; cmpxchg supposedly checks if *ptr == A and swaps.
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index 59393613d086..14ab9c789c0e 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -300,7 +300,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
}
/* pte_mkwrite - mark page as writable */
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= _PAGE_WRITE;
return pte;
diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c
index 4ba93e59370c..4e8bee25b8c6 100644
--- a/arch/hexagon/kernel/smp.c
+++ b/arch/hexagon/kernel/smp.c
@@ -217,7 +217,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
}
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
send_ipi(cpumask_of(cpu), IPI_RESCHEDULE);
}
diff --git a/arch/ia64/include/asm/cmpxchg.h b/arch/ia64/include/asm/cmpxchg.h
index 94ef84429843..8b2e644ef6a1 100644
--- a/arch/ia64/include/asm/cmpxchg.h
+++ b/arch/ia64/include/asm/cmpxchg.h
@@ -5,7 +5,7 @@
#include <uapi/asm/cmpxchg.h>
#define arch_xchg(ptr, x) \
-({(__typeof__(*(ptr))) __xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
+({(__typeof__(*(ptr))) __arch_xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
#define arch_cmpxchg(ptr, o, n) cmpxchg_acq((ptr), (o), (n))
#define arch_cmpxchg64(ptr, o, n) cmpxchg_acq((ptr), (o), (n))
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 21c97e31a28a..f879dd626da6 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -268,7 +268,7 @@ ia64_phys_addr_valid (unsigned long addr)
* access rights:
*/
#define pte_wrprotect(pte) (__pte(pte_val(pte) & ~_PAGE_AR_RW))
-#define pte_mkwrite(pte) (__pte(pte_val(pte) | _PAGE_AR_RW))
+#define pte_mkwrite(pte, vma) (__pte(pte_val(pte) | _PAGE_AR_RW))
#define pte_mkold(pte) (__pte(pte_val(pte) & ~_PAGE_A))
#define pte_mkyoung(pte) (__pte(pte_val(pte) | _PAGE_A))
#define pte_mkclean(pte) (__pte(pte_val(pte) & ~_PAGE_D))
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
index 259ae57570bf..85cba138311f 100644
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -23,7 +23,7 @@
*/
extern void ia64_xchg_called_with_bad_pointer(void);
-#define __xchg(x, ptr, size) \
+#define __arch_xchg(x, ptr, size) \
({ \
unsigned long __xchg_result; \
\
@@ -51,7 +51,7 @@ extern void ia64_xchg_called_with_bad_pointer(void);
#ifndef __KERNEL__
#define xchg(ptr, x) \
-({(__typeof__(*(ptr))) __xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
+({(__typeof__(*(ptr))) __arch_xchg((unsigned long) (x), (ptr), sizeof(*(ptr)));})
#endif
/*
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index f6195a0a00ae..9a5cd9fad3a9 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -201,7 +201,7 @@ __setup("nohalt", nohalt_setup);
#ifdef CONFIG_HOTPLUG_CPU
/* We don't actually take CPU down, just spin without interrupts. */
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
unsigned int this_cpu = smp_processor_id();
@@ -219,13 +219,13 @@ static inline void play_dead(void)
BUG();
}
#else
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
play_dead();
}
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index e2cc59db86bc..ea4f009a232b 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -220,11 +220,11 @@ kdump_smp_send_init(void)
* Called with preemption disabled.
*/
void
-smp_send_reschedule (int cpu)
+arch_smp_send_reschedule (int cpu)
{
ia64_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0);
}
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
/*
* Called with preemption disabled.
diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
index ecfa6cf79806..979fde61bba8 100644
--- a/arch/loongarch/include/asm/cmpxchg.h
+++ b/arch/loongarch/include/asm/cmpxchg.h
@@ -62,7 +62,7 @@ static inline unsigned int __xchg_small(volatile void *ptr, unsigned int val,
}
static __always_inline unsigned long
-__xchg(volatile void *ptr, unsigned long x, int size)
+__arch_xchg(volatile void *ptr, unsigned long x, int size)
{
switch (size) {
case 1:
@@ -87,7 +87,7 @@ __xchg(volatile void *ptr, unsigned long x, int size)
__typeof__(*(ptr)) __res; \
\
__res = (__typeof__(*(ptr))) \
- __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
+ __arch_xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
\
__res; \
})
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index d28fb9dbec59..ebf645f40298 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -390,7 +390,7 @@ static inline pte_t pte_mkdirty(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= _PAGE_WRITE;
if (pte_val(pte) & _PAGE_MODIFIED)
@@ -490,7 +490,7 @@ static inline int pmd_write(pmd_t pmd)
return !!(pmd_val(pmd) & _PAGE_WRITE);
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
pmd_val(pmd) |= _PAGE_WRITE;
if (pmd_val(pmd) & _PAGE_MODIFIED)
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index d82687390b4a..416b653bccb4 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -99,7 +99,7 @@ static inline void __cpu_die(unsigned int cpu)
loongson_cpu_die(cpu);
}
-extern void play_dead(void);
+extern void __noreturn play_dead(void);
#endif
#endif /* __ASM_SMP_H */
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index fa2443c7afb2..b71e17c1cc0c 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -62,7 +62,7 @@ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
#ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
play_dead();
}
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 8c6e227cb29d..ed167e244cda 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -155,11 +155,11 @@ void loongson_send_ipi_mask(const struct cpumask *mask, unsigned int action)
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
loongson_send_ipi_single(cpu, SMP_RESCHEDULE);
}
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
irqreturn_t loongson_ipi_interrupt(int irq, void *dev)
{
@@ -336,7 +336,7 @@ void play_dead(void)
iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR);
init_fn();
- unreachable();
+ BUG();
}
#endif
diff --git a/arch/loongarch/vdso/Makefile b/arch/loongarch/vdso/Makefile
index d89e2ac75f7b..461240ab4436 100644
--- a/arch/loongarch/vdso/Makefile
+++ b/arch/loongarch/vdso/Makefile
@@ -1,9 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
# Objects to go into the VDSO.
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_LARCH_32|R_LARCH_64|R_LARCH_MARK_LA|R_LARCH_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso-y := elf.o vgetcpu.o vgettimeofday.o sigreturn.o
diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h
index 6cf464cdab06..d7f3de9c5d6f 100644
--- a/arch/m68k/include/asm/cmpxchg.h
+++ b/arch/m68k/include/asm/cmpxchg.h
@@ -9,7 +9,7 @@
extern unsigned long __invalid_xchg_size(unsigned long, volatile void *, int);
#ifndef CONFIG_RMW_INSNS
-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
+static inline unsigned long __arch_xchg(unsigned long x, volatile void * ptr, int size)
{
unsigned long flags, tmp;
@@ -40,7 +40,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
return x;
}
#else
-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
+static inline unsigned long __arch_xchg(unsigned long x, volatile void * ptr, int size)
{
switch (size) {
case 1:
@@ -75,7 +75,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
}
#endif
-#define arch_xchg(ptr,x) ({(__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)));})
+#define arch_xchg(ptr,x) ({(__typeof__(*(ptr)))__arch_xchg((unsigned long)(x),(ptr),sizeof(*(ptr)));})
#include <asm-generic/cmpxchg-local.h>
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index d97fbb812f63..9abb78acc237 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -211,7 +211,7 @@ static inline pte_t pte_mkold(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= CF_PAGE_WRITABLE;
return pte;
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index ec0dc19ab834..c4e8eb76286d 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -155,7 +155,6 @@ static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;
static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) |= _PAGE_RONLY; return pte; }
static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~_PAGE_DIRTY; return pte; }
static inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) &= ~_PAGE_RONLY; return pte; }
static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; }
static inline pte_t pte_mknocache(pte_t pte)
@@ -168,6 +167,11 @@ static inline pte_t pte_mkcache(pte_t pte)
pte_val(pte) = (pte_val(pte) & _CACHEMASK040) | m68k_supervisor_cachemode;
return pte;
}
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ pte_val(pte) &= ~_PAGE_RONLY;
+ return pte;
+}
#define swapper_pg_dir kernel_pg_dir
extern pgd_t kernel_pg_dir[128];
diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h
index e582b0484a55..2a06bea51a1e 100644
--- a/arch/m68k/include/asm/sun3_pgtable.h
+++ b/arch/m68k/include/asm/sun3_pgtable.h
@@ -143,10 +143,14 @@ static inline int pte_young(pte_t pte) { return pte_val(pte) & SUN3_PAGE_ACCESS
static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_WRITEABLE; return pte; }
static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_MODIFIED; return pte; }
static inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~SUN3_PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= SUN3_PAGE_WRITEABLE; return pte; }
static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= SUN3_PAGE_MODIFIED; return pte; }
static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= SUN3_PAGE_ACCESSED; return pte; }
static inline pte_t pte_mknocache(pte_t pte) { pte_val(pte) |= SUN3_PAGE_NOCACHE; return pte; }
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ pte_val(pte) |= SUN3_PAGE_WRITEABLE;
+ return pte;
+}
// use this version when caches work...
//static inline pte_t pte_mkcache(pte_t pte) { pte_val(pte) &= SUN3_PAGE_NOCACHE; return pte; }
// until then, use:
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index d1b8272abcd9..5b83e82f8d7e 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -266,7 +266,7 @@ static inline pte_t pte_mkread(pte_t pte) \
{ pte_val(pte) |= _PAGE_USER; return pte; }
static inline pte_t pte_mkexec(pte_t pte) \
{ pte_val(pte) |= _PAGE_USER | _PAGE_EXEC; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) \
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) \
{ pte_val(pte) |= _PAGE_RW; return pte; }
static inline pte_t pte_mkdirty(pte_t pte) \
{ pte_val(pte) |= _PAGE_DIRTY; return pte; }
diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c
index 89954f5f87fb..4212584e6efa 100644
--- a/arch/mips/cavium-octeon/smp.c
+++ b/arch/mips/cavium-octeon/smp.c
@@ -20,6 +20,7 @@
#include <asm/mmu_context.h>
#include <asm/time.h>
#include <asm/setup.h>
+#include <asm/smp.h>
#include <asm/octeon/octeon.h>
diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h
index 7ec9493b2861..feed343ad483 100644
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -68,7 +68,7 @@ extern unsigned long __xchg_small(volatile void *ptr, unsigned long val,
unsigned int size);
static __always_inline
-unsigned long __xchg(volatile void *ptr, unsigned long x, int size)
+unsigned long __arch_xchg(volatile void *ptr, unsigned long x, int size)
{
switch (size) {
case 1:
@@ -102,7 +102,7 @@ unsigned long __xchg(volatile void *ptr, unsigned long x, int size)
smp_mb__before_llsc(); \
\
__res = (__typeof__(*(ptr))) \
- __xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
+ __arch_xchg((ptr), (unsigned long)(x), sizeof(*(ptr))); \
\
smp_llsc_mb(); \
\
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 574fa14ac8b2..a05cb342c80d 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -309,7 +309,7 @@ static inline pte_t pte_mkold(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte.pte_low |= _PAGE_WRITE;
if (pte.pte_low & _PAGE_MODIFIED) {
@@ -364,7 +364,7 @@ static inline pte_t pte_mkold(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= _PAGE_WRITE;
if (pte_val(pte) & _PAGE_MODIFIED)
@@ -627,7 +627,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
return pmd;
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
pmd_val(pmd) |= _PAGE_WRITE;
if (pmd_val(pmd) & _PAGE_MODIFIED)
diff --git a/arch/mips/include/asm/smp.h b/arch/mips/include/asm/smp.h
index 5d9ff61004ca..aab8981bc32c 100644
--- a/arch/mips/include/asm/smp.h
+++ b/arch/mips/include/asm/smp.h
@@ -66,7 +66,7 @@ extern void calculate_cpu_foreign_map(void);
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-static inline void smp_send_reschedule(int cpu)
+static inline void arch_smp_send_reschedule(int cpu)
{
extern const struct plat_smp_ops *mp_ops; /* private */
@@ -88,7 +88,7 @@ static inline void __cpu_die(unsigned int cpu)
mp_ops->cpu_die(cpu);
}
-extern void play_dead(void);
+extern void __noreturn play_dead(void);
#endif
#ifdef CONFIG_KEXEC
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index 093dbbd6b843..a3225912c862 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -40,7 +40,7 @@
#include <asm/stacktrace.h>
#ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
play_dead();
}
diff --git a/arch/mips/kernel/rtlx-cmp.c b/arch/mips/kernel/rtlx-cmp.c
index d26dcc4b46e7..e991cc936c1c 100644
--- a/arch/mips/kernel/rtlx-cmp.c
+++ b/arch/mips/kernel/rtlx-cmp.c
@@ -17,6 +17,8 @@
#include <asm/vpe.h>
#include <asm/rtlx.h>
+#include <trace/events/ipi.h>
+
static int major;
static void rtlx_interrupt(void)
diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c
index f5d7bfa3472a..15466d4cf4a0 100644
--- a/arch/mips/kernel/smp-bmips.c
+++ b/arch/mips/kernel/smp-bmips.c
@@ -54,6 +54,8 @@ static void bmips_set_reset_vec(int cpu, u32 val);
#ifdef CONFIG_SMP
+#include <asm/smp.h>
+
/* initial $sp, $gp - used by arch/mips/kernel/bmips_vec.S */
unsigned long bmips_smp_boot_sp;
unsigned long bmips_smp_boot_gp;
@@ -413,6 +415,8 @@ void __ref play_dead(void)
" wait\n"
" j bmips_secondary_reentry\n"
: : : "memory");
+
+ BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index f968a319d87f..62f677b2306f 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -20,6 +20,7 @@
#include <asm/mipsregs.h>
#include <asm/pm-cps.h>
#include <asm/r4kcache.h>
+#include <asm/smp.h>
#include <asm/smp-cps.h>
#include <asm/time.h>
#include <asm/uasm.h>
diff --git a/arch/mips/kernel/uprobes.c b/arch/mips/kernel/uprobes.c
index 6c063aa188e6..2b1f375c407b 100644
--- a/arch/mips/kernel/uprobes.c
+++ b/arch/mips/kernel/uprobes.c
@@ -2,6 +2,7 @@
#include <linux/highmem.h>
#include <linux/kdebug.h>
#include <linux/types.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/uprobes.h>
@@ -197,14 +198,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *aup,
unsigned long arch_uretprobe_hijack_return_addr(
unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- unsigned long ra;
-
- ra = regs->regs[31];
-
- /* Replace the return address with the trampoline address */
- regs->regs[31] = trampoline_vaddr;
-
- return ra;
+ return __xchg(&regs->regs[31], trampoline_vaddr);
}
/**
diff --git a/arch/mips/loongson64/smp.c b/arch/mips/loongson64/smp.c
index 90c783981197..b0e8bb9fa036 100644
--- a/arch/mips/loongson64/smp.c
+++ b/arch/mips/loongson64/smp.c
@@ -14,6 +14,7 @@
#include <linux/cpufreq.h>
#include <linux/kexec.h>
#include <asm/processor.h>
+#include <asm/smp.h>
#include <asm/time.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
@@ -808,6 +809,7 @@ out:
state_addr = &per_cpu(cpu_state, cpu);
mb();
play_dead_at_ckseg1(state_addr);
+ BUG();
}
static int loongson3_disable_clock(unsigned int cpu)
diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index 18af9474ed0e..eb56581f6d73 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -4,9 +4,7 @@
# Sanitizer runtimes are unavailable and cannot be linked here.
KCSAN_SANITIZE := n
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_MIPS_JUMP_SLOT|R_MIPS_GLOB_DAT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso-y := elf.o vgettimeofday.o sigreturn.o
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 0f5c2564e9f5..edd458518e0e 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -129,7 +129,7 @@ static inline pte_t pte_mkold(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= _PAGE_WRITE;
return pte;
diff --git a/arch/openrisc/include/asm/cmpxchg.h b/arch/openrisc/include/asm/cmpxchg.h
index 79fd16162ccb..8ee151c072e4 100644
--- a/arch/openrisc/include/asm/cmpxchg.h
+++ b/arch/openrisc/include/asm/cmpxchg.h
@@ -147,8 +147,8 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
extern unsigned long __xchg_called_with_bad_pointer(void)
__compiletime_error("Bad argument size for xchg");
-static inline unsigned long __xchg(volatile void *ptr, unsigned long with,
- int size)
+static inline unsigned long
+__arch_xchg(volatile void *ptr, unsigned long with, int size)
{
switch (size) {
case 1:
@@ -163,9 +163,9 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long with,
#define arch_xchg(ptr, with) \
({ \
- (__typeof__(*(ptr))) __xchg((ptr), \
- (unsigned long)(with), \
- sizeof(*(ptr))); \
+ (__typeof__(*(ptr))) __arch_xchg((ptr), \
+ (unsigned long)(with), \
+ sizeof(*(ptr))); \
})
#endif /* __ASM_OPENRISC_CMPXCHG_H */
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 3eb9b9555d0d..fd40aec189d1 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -250,7 +250,7 @@ static inline pte_t pte_mkold(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
pte_val(pte) |= _PAGE_WRITE;
return pte;
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index e1419095a6f0..0a7a059e2dff 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -173,7 +173,7 @@ void handle_IPI(unsigned int ipi_msg)
}
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
}
diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h
index 5f274be10567..c1d776bb16b4 100644
--- a/arch/parisc/include/asm/cmpxchg.h
+++ b/arch/parisc/include/asm/cmpxchg.h
@@ -22,7 +22,7 @@ extern unsigned long __xchg64(unsigned long, volatile unsigned long *);
/* optimizer better get rid of switch since size is a constant */
static inline unsigned long
-__xchg(unsigned long x, volatile void *ptr, int size)
+__arch_xchg(unsigned long x, volatile void *ptr, int size)
{
switch (size) {
#ifdef CONFIG_64BIT
@@ -49,7 +49,7 @@ __xchg(unsigned long x, volatile void *ptr, int size)
__typeof__(*(ptr)) __ret; \
__typeof__(*(ptr)) _x_ = (x); \
__ret = (__typeof__(*(ptr))) \
- __xchg((unsigned long)_x_, (ptr), sizeof(*(ptr))); \
+ __arch_xchg((unsigned long)_x_, (ptr), sizeof(*(ptr))); \
__ret; \
})
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index e2950f5db7c9..89f62137e67f 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -331,8 +331,12 @@ static inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~_PAGE_ACCESSED; retu
static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~_PAGE_WRITE; return pte; }
static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_WRITE; return pte; }
static inline pte_t pte_mkspecial(pte_t pte) { pte_val(pte) |= _PAGE_SPECIAL; return pte; }
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ pte_val(pte) |= _PAGE_WRITE;
+ return pte;
+}
/*
* Huge pte definitions.
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index c064719b49b0..97c6f875bd0e 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -159,7 +159,7 @@ EXPORT_SYMBOL(running_on_qemu);
/*
* Called from the idle thread for the CPU which has been shutdown.
*/
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
#ifdef CONFIG_HOTPLUG_CPU
idle_task_exit();
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 7dbd92cafae3..b7fc859fa87d 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -246,8 +246,8 @@ void kgdb_roundup_cpus(void)
inline void
smp_send_stop(void) { send_IPI_allbutself(IPI_CPU_STOP); }
-void
-smp_send_reschedule(int cpu) { send_IPI_single(cpu, IPI_RESCHEDULE); }
+void
+arch_smp_send_reschedule(int cpu) { send_IPI_single(cpu, IPI_RESCHEDULE); }
void
smp_send_all_nop(void)
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 7bf1fe7297c6..10d9a1d2aca9 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -498,7 +498,7 @@ static inline pte_t pte_mkpte(pte_t pte)
return pte;
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return __pte(pte_val(pte) | _PAGE_RW);
}
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4acc9690f599..be0636522d36 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -600,7 +600,7 @@ static inline pte_t pte_mkexec(pte_t pte)
return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC));
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
/*
* write implies read, hence set both
@@ -1071,7 +1071,7 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
-#define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd, vma) pte_pmd(pte_mkwrite(pmd_pte(pmd), (vma)))
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd))
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d0ea0571e79a..dbb50c06f0bf 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -229,7 +229,7 @@ __xchg_local(void *ptr, unsigned long x, unsigned int size)
return __xchg_u64_local(ptr, x);
#endif
}
- BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg");
+ BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg_local");
return x;
}
@@ -248,7 +248,7 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
return __xchg_u64_relaxed(ptr, x);
#endif
}
- BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg_local");
+ BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg_relaxed");
return x;
}
#define arch_xchg_local(ptr,x) \
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index fec56d965f00..7bfbcb9ba55b 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -171,7 +171,7 @@ void unmap_kernel_page(unsigned long va);
do { pte_update(mm, addr, ptep, ~0, 0, 0); } while (0)
#ifndef pte_mkwrite
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return __pte(pte_val(pte) | _PAGE_RW);
}
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 1a89ebdc3acc..f32450eb270a 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -101,7 +101,7 @@ static inline int pte_write(pte_t pte)
#define pte_write pte_write
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return __pte(pte_val(pte) & ~_PAGE_RO);
}
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index 287e25864ffa..589009555877 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -85,7 +85,7 @@
#ifndef __ASSEMBLY__
/* pte_clear moved to later in this file */
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return __pte(pte_val(pte) | _PAGE_RW);
}
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 6c6cb53d7045..aaaa576d0e15 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -67,7 +67,7 @@ void start_secondary(void *unused);
extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us);
extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us);
extern void smp_send_debugger_break(void);
-extern void start_secondary_resume(void);
+extern void __noreturn start_secondary_resume(void);
extern void smp_generic_give_timebase(void);
extern void smp_generic_take_timebase(void);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7db6b3faea65..265801a3e94c 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -61,6 +61,8 @@
#include <asm/kup.h>
#include <asm/fadump.h>
+#include <trace/events/ipi.h>
+
#ifdef DEBUG
#include <asm/udbg.h>
#define DBG(fmt...) udbg_printf(fmt)
@@ -364,12 +366,12 @@ static inline void do_message_pass(int cpu, int msg)
#endif
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
if (likely(smp_ops))
do_message_pass(cpu, PPC_MSG_RESCHEDULE);
}
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
void arch_send_call_function_single_ipi(int cpu)
{
@@ -1752,7 +1754,7 @@ void __cpu_die(unsigned int cpu)
smp_ops->cpu_die(cpu);
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
/*
* Disable on the down path. This will be re-enabled by
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
index 95a41ae9dfa7..3c15c320315e 100644
--- a/arch/powerpc/kernel/uprobes.c
+++ b/arch/powerpc/kernel/uprobes.c
@@ -7,6 +7,7 @@
* Adapted from the x86 port by Ananth N Mavinakayanahalli <ananth@in.ibm.com>
*/
#include <linux/kernel.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
@@ -197,14 +198,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- unsigned long orig_ret_vaddr;
-
- orig_ret_vaddr = regs->link;
-
- /* Replace the return addr with trampoline addr */
- regs->link = trampoline_vaddr;
-
- return orig_ret_vaddr;
+ return __xchg(&regs->link, trampoline_vaddr);
}
bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile
index 66f723f53be2..4c3f34485f08 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -2,7 +2,7 @@
# List of files in the vdso, has to be asm only for now
-ARCH_REL_TYPE_ABS := R_PPC_JUMP_SLOT|R_PPC_GLOB_DAT|R_PPC_ADDR32|R_PPC_ADDR24|R_PPC_ADDR16|R_PPC_ADDR16_LO|R_PPC_ADDR16_HI|R_PPC_ADDR16_HA|R_PPC_ADDR14|R_PPC_ADDR14_BRTAKEN|R_PPC_ADDR14_BRNTAKEN|R_PPC_REL24
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso32 = sigtramp32-32.o gettimeofday-32.o datapage-32.o cacheflush-32.o note-32.o getcpu-32.o
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index c973bf556fb3..36b295e908ca 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -43,6 +43,7 @@
#include <linux/compiler.h>
#include <linux/of.h>
#include <linux/irqdomain.h>
+#include <linux/smp.h>
#include <asm/ftrace.h>
#include <asm/reg.h>
@@ -80,6 +81,8 @@
#include <asm/dtl.h>
#include <asm/plpar_wrappers.h>
+#include <trace/events/ipi.h>
+
#include "book3s.h"
#include "book3s_hv.h"
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 7e98b00ea2e8..c53c4c797768 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -20,6 +20,8 @@
#include <asm/opal.h>
#include <asm/smp.h>
+#include <trace/events/ipi.h>
+
#include "subcore.h"
#include "powernv.h"
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 0dfe9d857a76..bba472928b53 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -261,7 +261,7 @@ c_t arch_atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n) \
static __always_inline \
c_t arch_atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n) \
{ \
- return __xchg(&(v->counter), n, size); \
+ return __arch_xchg(&(v->counter), n, size); \
} \
static __always_inline \
c_t arch_atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v, \
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 12debce235e5..2f4726d3cfcc 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -114,7 +114,7 @@
_x_, sizeof(*(ptr))); \
})
-#define __xchg(ptr, new, size) \
+#define __arch_xchg(ptr, new, size) \
({ \
__typeof__(ptr) __ptr = (ptr); \
__typeof__(new) __new = (new); \
@@ -143,7 +143,7 @@
#define arch_xchg(ptr, x) \
({ \
__typeof__(*(ptr)) _x_ = (x); \
- (__typeof__(*(ptr))) __xchg((ptr), _x_, sizeof(*(ptr))); \
+ (__typeof__(*(ptr))) __arch_xchg((ptr), _x_, sizeof(*(ptr))); \
})
#define xchg32(ptr, x) \
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 3303fc03d724..9a207567d31e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -375,7 +375,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
/* static inline pte_t pte_mkread(pte_t pte) */
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return __pte(pte_val(pte) | _PAGE_WRITE);
}
@@ -661,9 +661,9 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return pte_pmd(pte_mkyoung(pmd_pte(pmd)));
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
- return pte_pmd(pte_mkwrite(pmd_pte(pmd)));
+ return pte_pmd(pte_mkwrite(pmd_pte(pmd), vma));
}
static inline pmd_t pmd_wrprotect(pmd_t pmd)
diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c
index f7a832e3a1d1..59b80211c25f 100644
--- a/arch/riscv/kernel/cpu-hotplug.c
+++ b/arch/riscv/kernel/cpu-hotplug.c
@@ -71,7 +71,7 @@ void __cpu_die(unsigned int cpu)
/*
* Called from the idle thread for the CPU which has been shutdown.
*/
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
idle_task_exit();
diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c
index c976a21cd4bd..5c8415e21653 100644
--- a/arch/riscv/kernel/probes/uprobes.c
+++ b/arch/riscv/kernel/probes/uprobes.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/highmem.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/ptrace.h>
#include <linux/uprobes.h>
@@ -122,13 +123,7 @@ unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
struct pt_regs *regs)
{
- unsigned long ra;
-
- ra = regs->ra;
-
- regs->ra = trampoline_vaddr;
-
- return ra;
+ return __xchg(&regs->ra, trampoline_vaddr);
}
int arch_uprobe_exception_notify(struct notifier_block *self,
diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c
index 8c3b59f1f9b8..42e9656a1db2 100644
--- a/arch/riscv/kernel/smp.c
+++ b/arch/riscv/kernel/smp.c
@@ -328,8 +328,8 @@ bool smp_crash_stop_failed(void)
}
#endif
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
send_ipi_single(cpu, IPI_RESCHEDULE);
}
-EXPORT_SYMBOL_GPL(smp_send_reschedule);
+EXPORT_SYMBOL_GPL(arch_smp_send_reschedule);
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index 06e6b27f3bcc..a04b3bc35ca2 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -1,9 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
# Copied from arch/tile/kernel/vdso/Makefile
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_RISCV_32|R_RISCV_64|R_RISCV_JUMP_SLOT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Symbols present in the vdso
vdso-syms = rt_sigreturn
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 3f26416c2ad8..06e0e42f4eec 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -14,8 +14,8 @@
void __xchg_called_with_bad_pointer(void);
-static __always_inline unsigned long __xchg(unsigned long x,
- unsigned long address, int size)
+static __always_inline unsigned long
+__arch_xchg(unsigned long x, unsigned long address, int size)
{
unsigned long old;
int shift;
@@ -77,8 +77,8 @@ static __always_inline unsigned long __xchg(unsigned long x,
__typeof__(*(ptr)) __ret; \
\
__ret = (__typeof__(*(ptr))) \
- __xchg((unsigned long)(x), (unsigned long)(ptr), \
- sizeof(*(ptr))); \
+ __arch_xchg((unsigned long)(x), (unsigned long)(ptr), \
+ sizeof(*(ptr))); \
__ret; \
})
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index ccdbccfde148..558f7eef9c4d 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -102,9 +102,9 @@ static inline int huge_pte_dirty(pte_t pte)
return pte_dirty(pte);
}
-static inline pte_t huge_pte_mkwrite(pte_t pte)
+static inline pte_t huge_pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
- return pte_mkwrite(pte);
+ return pte_mkwrite(pte, vma);
}
static inline pte_t huge_pte_mkdirty(pte_t pte)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index c1f6b46ec555..2d2d970e8f5e 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1005,7 +1005,7 @@ static inline pte_t pte_wrprotect(pte_t pte)
return set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_kernel(pte_t pte)
{
pte = set_pte_bit(pte, __pgprot(_PAGE_WRITE));
if (pte_val(pte) & _PAGE_DIRTY)
@@ -1013,6 +1013,11 @@ static inline pte_t pte_mkwrite(pte_t pte)
return pte;
}
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ return pte_mkwrite_kernel(pte);
+}
+
static inline pte_t pte_mkclean(pte_t pte)
{
pte = clear_pte_bit(pte, __pgprot(_PAGE_DIRTY));
@@ -1488,7 +1493,7 @@ static inline pmd_t pmd_wrprotect(pmd_t pmd)
return set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_PROTECT));
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite_kernel(pmd_t pmd)
{
pmd = set_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_WRITE));
if (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)
@@ -1496,6 +1501,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
return pmd;
}
+static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ return pmd_mkwrite_kernel(pmd);
+}
+
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
pmd = clear_pmd_bit(pmd, __pgprot(_SEGMENT_ENTRY_DIRTY));
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 38e267c7bff7..e7239aaf428b 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -88,7 +88,7 @@ void arch_cpu_idle_exit(void)
{
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
cpu_die();
}
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 27c710178033..880b1077e0f3 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -551,7 +551,7 @@ void arch_send_call_function_single_ipi(int cpu)
* it goes straight through and wastes no time serializing
* anything. Worst case is that we lose a reschedule ...
*/
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
pcpu_ec_call(pcpu_devices + cpu, ec_schedule);
}
diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c
index b88345ef8bd9..18591ca40ae7 100644
--- a/arch/s390/kernel/uprobes.c
+++ b/arch/s390/kernel/uprobes.c
@@ -11,6 +11,7 @@
#include <linux/compat.h>
#include <linux/kdebug.h>
#include <linux/sched/task_stack.h>
+#include <linux/non-atomic/xchg.h>
#include <asm/switch_to.h>
#include <asm/facility.h>
@@ -144,11 +145,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline,
struct pt_regs *regs)
{
- unsigned long orig;
-
- orig = regs->gprs[14];
- regs->gprs[14] = trampoline;
- return orig;
+ return __xchg(&regs->gprs[14], trampoline);
}
bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index 245bddfe9bc0..bafd3147eb4e 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -2,9 +2,8 @@
# List of files in the vdso
KCOV_INSTRUMENT := n
-ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
-ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso32 = vdso_user_wrapper-32.o note-32.o
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 34f9542636e9..a766d286e15f 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -2,9 +2,8 @@
# List of files in the vdso
KCOV_INSTRUMENT := n
-ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE
-ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
obj-vdso64 = vdso_user_wrapper.o note.o
obj-cvdso64 = vdso64_generic.o getcpu.o
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 7838e9c70000..8a28e5da1e5a 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -96,7 +96,7 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (flags & SET_MEMORY_RO)
new = pte_wrprotect(new);
else if (flags & SET_MEMORY_RW)
- new = pte_mkwrite(pte_mkdirty(new));
+ new = pte_mkwrite_kernel(pte_mkdirty(new));
if (flags & SET_MEMORY_NX)
new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC));
else if (flags & SET_MEMORY_X)
@@ -146,7 +146,7 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
if (flags & SET_MEMORY_RO)
new = pmd_wrprotect(new);
else if (flags & SET_MEMORY_RW)
- new = pmd_mkwrite(pmd_mkdirty(new));
+ new = pmd_mkwrite_kernel(pmd_mkdirty(new));
if (flags & SET_MEMORY_NX)
new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
index 0ed9b3f4a577..288f6f38d98f 100644
--- a/arch/sh/include/asm/cmpxchg.h
+++ b/arch/sh/include/asm/cmpxchg.h
@@ -22,7 +22,7 @@
extern void __xchg_called_with_bad_pointer(void);
-#define __xchg(ptr, x, size) \
+#define __arch_xchg(ptr, x, size) \
({ \
unsigned long __xchg__res; \
volatile void *__xchg_ptr = (ptr); \
@@ -46,7 +46,7 @@ extern void __xchg_called_with_bad_pointer(void);
})
#define arch_xchg(ptr,x) \
- ((__typeof__(*(ptr)))__xchg((ptr),(unsigned long)(x), sizeof(*(ptr))))
+ ((__typeof__(*(ptr)))__arch_xchg((ptr),(unsigned long)(x), sizeof(*(ptr))))
/* This function doesn't exist, so you'll get a linker error
* if something tries to do an invalid cmpxchg(). */
diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h
index 21952b094650..9f2dcb9eafc8 100644
--- a/arch/sh/include/asm/pgtable_32.h
+++ b/arch/sh/include/asm/pgtable_32.h
@@ -351,6 +351,12 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
#define PTE_BIT_FUNC(h,fn,op) \
static inline pte_t pte_##fn(pte_t pte) { pte.pte_##h op; return pte; }
+#define PTE_BIT_FUNC_VMA(h,fn,op) \
+static inline pte_t pte_##fn(pte_t pte, struct vm_area_struct *vma) \
+{ \
+ pte.pte_##h op; \
+ return pte; \
+}
#ifdef CONFIG_X2TLB
/*
@@ -359,11 +365,11 @@ static inline pte_t pte_##fn(pte_t pte) { pte.pte_##h op; return pte; }
* kernel permissions), we attempt to couple them a bit more sanely here.
*/
PTE_BIT_FUNC(high, wrprotect, &= ~(_PAGE_EXT_USER_WRITE | _PAGE_EXT_KERN_WRITE));
-PTE_BIT_FUNC(high, mkwrite, |= _PAGE_EXT_USER_WRITE | _PAGE_EXT_KERN_WRITE);
+PTE_BIT_FUNC_VMA(high, mkwrite, |= _PAGE_EXT_USER_WRITE | _PAGE_EXT_KERN_WRITE);
PTE_BIT_FUNC(high, mkhuge, |= _PAGE_SZHUGE);
#else
PTE_BIT_FUNC(low, wrprotect, &= ~_PAGE_RW);
-PTE_BIT_FUNC(low, mkwrite, |= _PAGE_RW);
+PTE_BIT_FUNC_VMA(low, mkwrite, |= _PAGE_RW);
PTE_BIT_FUNC(low, mkhuge, |= _PAGE_SZHUGE);
#endif
diff --git a/arch/sh/include/asm/smp-ops.h b/arch/sh/include/asm/smp-ops.h
index e27702130eb6..97331fcb7b85 100644
--- a/arch/sh/include/asm/smp-ops.h
+++ b/arch/sh/include/asm/smp-ops.h
@@ -24,9 +24,10 @@ static inline void plat_smp_setup(void)
mp_ops->smp_setup();
}
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
mp_ops->play_dead();
+ BUG();
}
extern void register_smp_ops(struct plat_smp_ops *ops);
@@ -42,7 +43,7 @@ static inline void register_smp_ops(struct plat_smp_ops *ops)
{
}
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
BUG();
}
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 3418c40f0099..d662503b0665 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2002 - 2009 Paul Mundt
*/
+#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -29,7 +30,7 @@ void default_idle(void)
clear_bl_bit();
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
play_dead();
}
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 65924d9ec245..5cf35a774dc7 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -256,7 +256,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
(bogosum / (5000/HZ)) % 100);
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
mp_ops->send_ipi(cpu, SMP_MSG_RESCHEDULE);
}
diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h
index 27a57a3a7597..7a1339533d1d 100644
--- a/arch/sparc/include/asm/cmpxchg_32.h
+++ b/arch/sparc/include/asm/cmpxchg_32.h
@@ -15,7 +15,7 @@
unsigned long __xchg_u32(volatile u32 *m, u32 new);
void __xchg_called_with_bad_pointer(void);
-static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int size)
+static inline unsigned long __arch_xchg(unsigned long x, __volatile__ void * ptr, int size)
{
switch (size) {
case 4:
@@ -25,7 +25,7 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int
return x;
}
-#define arch_xchg(ptr,x) ({(__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)));})
+#define arch_xchg(ptr,x) ({(__typeof__(*(ptr)))__arch_xchg((unsigned long)(x),(ptr),sizeof(*(ptr)));})
/* Emulate cmpxchg() the same way we emulate atomics,
* by hashing the object address and indexing into an array
diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h
index 12d00a42c0a3..66cd61dde9ec 100644
--- a/arch/sparc/include/asm/cmpxchg_64.h
+++ b/arch/sparc/include/asm/cmpxchg_64.h
@@ -55,7 +55,7 @@ static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long
#define arch_xchg(ptr,x) \
({ __typeof__(*(ptr)) __ret; \
__ret = (__typeof__(*(ptr))) \
- __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \
+ __arch_xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \
__ret; \
})
@@ -87,8 +87,8 @@ xchg16(__volatile__ unsigned short *m, unsigned short val)
return (load32 & mask) >> bit_shift;
}
-static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
- int size)
+static inline unsigned long
+__arch_xchg(unsigned long x, __volatile__ void * ptr, int size)
{
switch (size) {
case 2:
diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h
index 7a8380c63aab..799e797c5cdd 100644
--- a/arch/sparc/include/asm/mmu_context_64.h
+++ b/arch/sparc/include/asm/mmu_context_64.h
@@ -185,6 +185,12 @@ static inline void finish_arch_post_lock_switch(void)
}
}
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+ return -1UL >> adi_nbits();
+}
+
#include <asm-generic/mmu_context.h>
#endif /* !(__ASSEMBLY__) */
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index d4330e3c57a6..3e8836179456 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -241,7 +241,7 @@ static inline pte_t pte_mkold(pte_t pte)
return __pte(pte_val(pte) & ~SRMMU_REF);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
return __pte(pte_val(pte) | SRMMU_WRITE);
}
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2dc8d4641734..c5cd5c03f557 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -466,7 +466,7 @@ static inline pte_t pte_mkclean(pte_t pte)
return __pte(val);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
unsigned long val = pte_val(pte), mask;
@@ -756,11 +756,11 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return __pmd(pte_val(pte));
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
+static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
pte_t pte = __pte(pmd_val(pmd));
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, vma);
return __pmd(pte_val(pte));
}
diff --git a/arch/sparc/include/asm/smp_64.h b/arch/sparc/include/asm/smp_64.h
index e75783b6abc4..505b6700805d 100644
--- a/arch/sparc/include/asm/smp_64.h
+++ b/arch/sparc/include/asm/smp_64.h
@@ -49,7 +49,7 @@ int hard_smp_processor_id(void);
void smp_fill_in_cpu_possible_map(void);
void smp_fill_in_sib_core_maps(void);
-void cpu_play_dead(void);
+void __noreturn cpu_play_dead(void);
void smp_fetch_global_regs(void);
void smp_fetch_global_pmu(void);
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 94266a5c5b04..b825a5dd0210 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -8,8 +8,10 @@
#include <linux/compiler.h>
#include <linux/string.h>
+#include <linux/mm_types.h>
#include <asm/asi.h>
#include <asm/spitfire.h>
+#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm-generic/access_ok.h>
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 91c2b8124527..b51d8fb0ecdc 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -95,7 +95,7 @@ void arch_cpu_idle(void)
}
#ifdef CONFIG_HOTPLUG_CPU
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
sched_preempt_enable_no_resched();
cpu_play_dead();
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index dad38960d1a8..82da8a2d769d 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -751,7 +751,7 @@ out:
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c
index 570e43e6fda5..b4e410976e0d 100644
--- a/arch/sparc/kernel/signal_64.c
+++ b/arch/sparc/kernel/signal_64.c
@@ -562,7 +562,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c
index ad8094d955eb..87eaa7719fa2 100644
--- a/arch/sparc/kernel/smp_32.c
+++ b/arch/sparc/kernel/smp_32.c
@@ -120,7 +120,7 @@ void cpu_panic(void)
struct linux_prom_registers smp_penguin_ctable = { 0 };
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
/*
* CPU model dependent way of implementing IPI generation targeting
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index a55295d1b924..e5964d1d8b37 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1430,7 +1430,7 @@ static unsigned long send_cpu_poke(int cpu)
return hv_err;
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
if (cpu == smp_processor_id()) {
WARN_ON_ONCE(preemptible());
diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c
index 1a0600206bf5..1c02653d6de3 100644
--- a/arch/sparc/kernel/uprobes.c
+++ b/arch/sparc/kernel/uprobes.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/highmem.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>
#include <linux/sched.h> /* For struct task_struct */
@@ -310,9 +311,5 @@ unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr,
struct pt_regs *regs)
{
- unsigned long orig_ret_vaddr = regs->u_regs[UREG_I7];
-
- regs->u_regs[UREG_I7] = trampoline_vaddr-8;
-
- return orig_ret_vaddr + 8;
+ return __xchg(&regs->u_regs[UREG_I7], trampoline_vaddr - 8) + 8;
}
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index a70d1618eb35..963479c133b7 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -207,7 +207,7 @@ static inline pte_t pte_mkyoung(pte_t pte)
return(pte);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
if (unlikely(pte_get_bits(pte, _PAGE_RW)))
return pte;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b54ca1a77d22..5254225aea73 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1854,6 +1854,11 @@ config CC_HAS_IBT
(CC_IS_CLANG && CLANG_VERSION >= 140000)) && \
$(as-instr,endbr64)
+config X86_CET
+ def_bool n
+ help
+ CET features configured (Shadow stack or IBT)
+
config X86_KERNEL_IBT
prompt "Indirect Branch Tracking"
def_bool y
@@ -1861,6 +1866,7 @@ config X86_KERNEL_IBT
# https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f
depends on !LD_IS_LLD || LLD_VERSION >= 140000
select OBJTOOL
+ select X86_CET
help
Build the kernel with support for Indirect Branch Tracking, a
hardware support course-grain forward-edge Control Flow Integrity
@@ -1955,6 +1961,24 @@ config X86_SGX
If unsure, say N.
+config X86_USER_SHADOW_STACK
+ bool "X86 userspace shadow stack"
+ depends on AS_WRUSS
+ depends on X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
+ select X86_CET
+ help
+ Shadow stack protection is a hardware feature that detects function
+ return address corruption. This helps mitigate ROP attacks.
+ Applications must be enabled to use it, and old userspace does not
+ get protection "for free".
+
+ CPUs supporting shadow stacks were first released in 2020.
+
+ See Documentation/x86/shstk.rst for more information.
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -2293,6 +2317,17 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
If unsure, leave at the default value.
+config ADDRESS_MASKING
+ bool "Linear Address Masking support"
+ depends on X86_64
+ help
+ Linear Address Masking (LAM) modifies the checking that is applied
+ to 64-bit linear addresses, allowing software to use of the
+ untranslated address bits for metadata.
+
+ The capability can be used for efficient address sanitizers (ASAN)
+ implementation and for optimizations in JITs.
+
config HOTPLUG_CPU
def_bool y
depends on SMP
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index b88f784cb02e..8ad41da301e5 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -24,3 +24,8 @@ config AS_GFNI
def_bool $(as-instr,vgf2p8mulb %xmm0$(comma)%xmm1$(comma)%xmm2)
help
Supported by binutils >= 2.30 and LLVM integrated assembler
+
+config AS_WRUSS
+ def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
+ help
+ Supported by binutils >= 2.31 and LLVM integrated assembler
diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c
index 918a7606f53c..2d81d3cc72a1 100644
--- a/arch/x86/boot/compressed/tdx.c
+++ b/arch/x86/boot/compressed/tdx.c
@@ -26,7 +26,7 @@ static inline unsigned int tdx_io_in(int size, u16 port)
.r14 = port,
};
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return UINT_MAX;
return args.r11;
@@ -43,7 +43,7 @@ static inline void tdx_io_out(int size, u16 port, u32 value)
.r15 = value,
};
- __tdx_hypercall(&args, 0);
+ __tdx_hypercall(&args);
}
static inline u8 tdx_inb(u16 port)
diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c
index 49b44f881484..73f83233d25d 100644
--- a/arch/x86/coco/core.c
+++ b/arch/x86/coco/core.c
@@ -13,7 +13,7 @@
#include <asm/coco.h>
#include <asm/processor.h>
-static enum cc_vendor vendor __ro_after_init;
+enum cc_vendor cc_vendor __ro_after_init;
static u64 cc_mask __ro_after_init;
static bool intel_cc_platform_has(enum cc_attr attr)
@@ -30,6 +30,22 @@ static bool intel_cc_platform_has(enum cc_attr attr)
}
/*
+ * Handle the SEV-SNP vTOM case where sme_me_mask is zero, and
+ * the other levels of SME/SEV functionality, including C-bit
+ * based SEV-SNP, are not enabled.
+ */
+static __maybe_unused bool amd_cc_platform_vtom(enum cc_attr attr)
+{
+ switch (attr) {
+ case CC_ATTR_GUEST_MEM_ENCRYPT:
+ case CC_ATTR_MEM_ENCRYPT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
* SME and SEV are very similar but they are not the same, so there are
* times that the kernel will need to distinguish between SME and SEV. The
* cc_platform_has() function is used for this. When a distinction isn't
@@ -41,9 +57,14 @@ static bool intel_cc_platform_has(enum cc_attr attr)
* up under SME the trampoline area cannot be encrypted, whereas under SEV
* the trampoline area must be encrypted.
*/
+
static bool amd_cc_platform_has(enum cc_attr attr)
{
#ifdef CONFIG_AMD_MEM_ENCRYPT
+
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return amd_cc_platform_vtom(attr);
+
switch (attr) {
case CC_ATTR_MEM_ENCRYPT:
return sme_me_mask;
@@ -76,20 +97,13 @@ static bool amd_cc_platform_has(enum cc_attr attr)
#endif
}
-static bool hyperv_cc_platform_has(enum cc_attr attr)
-{
- return attr == CC_ATTR_GUEST_MEM_ENCRYPT;
-}
-
bool cc_platform_has(enum cc_attr attr)
{
- switch (vendor) {
+ switch (cc_vendor) {
case CC_VENDOR_AMD:
return amd_cc_platform_has(attr);
case CC_VENDOR_INTEL:
return intel_cc_platform_has(attr);
- case CC_VENDOR_HYPERV:
- return hyperv_cc_platform_has(attr);
default:
return false;
}
@@ -103,11 +117,14 @@ u64 cc_mkenc(u64 val)
* encryption status of the page.
*
* - for AMD, bit *set* means the page is encrypted
- * - for Intel *clear* means encrypted.
+ * - for AMD with vTOM and for Intel, *clear* means encrypted
*/
- switch (vendor) {
+ switch (cc_vendor) {
case CC_VENDOR_AMD:
- return val | cc_mask;
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return val & ~cc_mask;
+ else
+ return val | cc_mask;
case CC_VENDOR_INTEL:
return val & ~cc_mask;
default:
@@ -118,9 +135,12 @@ u64 cc_mkenc(u64 val)
u64 cc_mkdec(u64 val)
{
/* See comment in cc_mkenc() */
- switch (vendor) {
+ switch (cc_vendor) {
case CC_VENDOR_AMD:
- return val & ~cc_mask;
+ if (sev_status & MSR_AMD64_SNP_VTOM)
+ return val | cc_mask;
+ else
+ return val & ~cc_mask;
case CC_VENDOR_INTEL:
return val | cc_mask;
default:
@@ -129,11 +149,6 @@ u64 cc_mkdec(u64 val)
}
EXPORT_SYMBOL_GPL(cc_mkdec);
-__init void cc_set_vendor(enum cc_vendor v)
-{
- vendor = v;
-}
-
__init void cc_set_mask(u64 mask)
{
cc_mask = mask;
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
index 6a255e6809bc..b193c0a1d8db 100644
--- a/arch/x86/coco/tdx/tdcall.S
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -85,12 +85,12 @@ SYM_FUNC_START(__tdx_module_call)
SYM_FUNC_END(__tdx_module_call)
/*
- * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf
- * of TDCALL instruction
+ * TDX_HYPERCALL - Make hypercalls to a TDX VMM using TDVMCALL leaf of TDCALL
+ * instruction
*
* Transforms values in function call argument struct tdx_hypercall_args @args
* into the TDCALL register ABI. After TDCALL operation, VMM output is saved
- * back in @args.
+ * back in @args, if \ret is 1.
*
*-------------------------------------------------------------------------
* TD VMCALL ABI:
@@ -105,26 +105,18 @@ SYM_FUNC_END(__tdx_module_call)
* specification. Non zero value indicates vendor
* specific ABI.
* R11 - VMCALL sub function number
- * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments.
+ * RBX, RDX, RDI, RSI - Used to pass VMCALL sub function specific arguments.
* R8-R9, R12-R15 - Same as above.
*
* Output Registers:
*
* RAX - TDCALL instruction status (Not related to hypercall
* output).
- * R10 - Hypercall output error code.
- * R11-R15 - Hypercall sub function specific output values.
+ * RBX, RDX, RDI, RSI - Hypercall sub function specific output values.
+ * R8-R15 - Same as above.
*
- *-------------------------------------------------------------------------
- *
- * __tdx_hypercall() function ABI:
- *
- * @args (RDI) - struct tdx_hypercall_args for input and output
- * @flags (RSI) - TDX_HCALL_* flags
- *
- * On successful completion, return the hypercall error code.
*/
-SYM_FUNC_START(__tdx_hypercall)
+.macro TDX_HYPERCALL ret:req
FRAME_BEGIN
/* Save callee-saved GPRs as mandated by the x86_64 ABI */
@@ -134,9 +126,8 @@ SYM_FUNC_START(__tdx_hypercall)
push %r12
push %rbx
- /* Free RDI and RSI to be used as TDVMCALL arguments */
+ /* Free RDI to be used as TDVMCALL arguments */
movq %rdi, %rax
- push %rsi
/* Copy hypercall registers from arg struct: */
movq TDX_HYPERCALL_r8(%rax), %r8
@@ -171,14 +162,11 @@ SYM_FUNC_START(__tdx_hypercall)
* and are handled by callers.
*/
testq %rax, %rax
- jne .Lpanic
+ jne .Lpanic\@
pop %rax
- /* Copy hypercall result registers to arg struct if needed */
- testq $TDX_HCALL_HAS_OUTPUT, (%rsp)
- jz .Lout
-
+ .if \ret
movq %r8, TDX_HYPERCALL_r8(%rax)
movq %r9, TDX_HYPERCALL_r9(%rax)
movq %r10, TDX_HYPERCALL_r10(%rax)
@@ -191,7 +179,8 @@ SYM_FUNC_START(__tdx_hypercall)
movq %rsi, TDX_HYPERCALL_rsi(%rax)
movq %rbx, TDX_HYPERCALL_rbx(%rax)
movq %rdx, TDX_HYPERCALL_rdx(%rax)
-.Lout:
+ .endif
+
/* TDVMCALL leaf return code is in R10 */
movq %r10, %rax
@@ -208,9 +197,6 @@ SYM_FUNC_START(__tdx_hypercall)
xor %rdi, %rdi
xor %rdx, %rdx
- /* Remove TDX_HCALL_* flags from the stack */
- pop %rsi
-
/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
pop %rbx
pop %r12
@@ -221,9 +207,33 @@ SYM_FUNC_START(__tdx_hypercall)
FRAME_END
RET
-.Lpanic:
+.Lpanic\@:
call __tdx_hypercall_failed
/* __tdx_hypercall_failed never returns */
REACHABLE
- jmp .Lpanic
+ jmp .Lpanic\@
+.endm
+
+/*
+ *
+ * __tdx_hypercall() function ABI:
+ *
+ * @args (RDI) - struct tdx_hypercall_args for input
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdx_hypercall)
+ TDX_HYPERCALL ret=0
SYM_FUNC_END(__tdx_hypercall)
+
+/*
+ *
+ * __tdx_hypercall_ret() function ABI:
+ *
+ * @args (RDI) - struct tdx_hypercall_args for input and output
+ *
+ * On successful completion, return the hypercall error code.
+ */
+SYM_FUNC_START(__tdx_hypercall_ret)
+ TDX_HYPERCALL ret=1
+SYM_FUNC_END(__tdx_hypercall_ret)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 055300e08fb3..e146b599260f 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -66,7 +66,7 @@ static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15)
.r15 = r15,
};
- return __tdx_hypercall(&args, 0);
+ return __tdx_hypercall(&args);
}
/* Called from __tdx_hypercall() for unrecoverable failure */
@@ -99,7 +99,7 @@ long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2,
.r14 = p4,
};
- return __tdx_hypercall(&args, 0);
+ return __tdx_hypercall(&args);
}
EXPORT_SYMBOL_GPL(tdx_kvm_hypercall);
#endif
@@ -179,7 +179,7 @@ static void __noreturn tdx_panic(const char *msg)
* happens to return.
*/
while (1)
- __tdx_hypercall(&args, 0);
+ __tdx_hypercall(&args);
}
static void tdx_parse_tdinfo(u64 *cc_mask)
@@ -289,7 +289,7 @@ static u64 __cpuidle __halt(const bool irq_disabled)
* can keep the vCPU in virtual HLT, even if an IRQ is
* pending, without hanging/breaking the guest.
*/
- return __tdx_hypercall(&args, 0);
+ return __tdx_hypercall(&args);
}
static int handle_halt(struct ve_info *ve)
@@ -326,7 +326,7 @@ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
*/
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return -EIO;
regs->ax = lower_32_bits(args.r11);
@@ -348,7 +348,7 @@ static int write_msr(struct pt_regs *regs, struct ve_info *ve)
* can be found in TDX Guest-Host-Communication Interface
* (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
*/
- if (__tdx_hypercall(&args, 0))
+ if (__tdx_hypercall(&args))
return -EIO;
return ve_instr_len(ve);
@@ -380,7 +380,7 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
* ABI can be found in TDX Guest-Host-Communication Interface
* (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
*/
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return -EIO;
/*
@@ -407,7 +407,7 @@ static bool mmio_read(int size, unsigned long addr, unsigned long *val)
.r15 = *val,
};
- if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
+ if (__tdx_hypercall_ret(&args))
return false;
*val = args.r11;
return true;
@@ -541,7 +541,7 @@ static bool handle_in(struct pt_regs *regs, int size, int port)
* in TDX Guest-Host-Communication Interface (GHCI) section titled
* "TDG.VP.VMCALL<Instruction.IO>".
*/
- success = !__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT);
+ success = !__tdx_hypercall_ret(&args);
/* Update part of the register affected by the emulated instruction */
regs->ax &= ~mask;
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d94d361f506f..f31e286c2977 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -205,7 +205,7 @@ syscall_return_via_sysret:
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
@@ -286,7 +286,7 @@ SYM_FUNC_END(__switch_to_asm)
.pushsection .text, "ax"
__FUNC_ALIGN
SYM_CODE_START_NOALIGN(ret_from_fork)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
movq %rax, %rdi
@@ -303,7 +303,7 @@ SYM_CODE_START_NOALIGN(ret_from_fork)
1:
/* kernel thread */
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
movq %r12, %rdi
CALL_NOSPEC rbx
/*
@@ -388,9 +388,9 @@ SYM_CODE_START(\asmsym)
.if \vector == X86_TRAP_BP
/* #BP advances %rip to the next instruction */
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8 signal=0
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8 signal=0
.else
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+ UNWIND_HINT_IRET_ENTRY offset=\has_error_code*8
.endif
ENDBR
@@ -461,7 +461,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_mce_db vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
ASM_CLAC
cld
@@ -518,7 +518,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_vc vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
ASM_CLAC
cld
@@ -582,7 +582,7 @@ SYM_CODE_END(\asmsym)
*/
.macro idtentry_df vector asmsym cfunc
SYM_CODE_START(\asmsym)
- UNWIND_HINT_IRET_REGS offset=8
+ UNWIND_HINT_IRET_ENTRY offset=8
ENDBR
ASM_CLAC
cld
@@ -643,7 +643,7 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
@@ -869,7 +869,7 @@ SYM_CODE_END(exc_xen_hypervisor_callback)
*/
__FUNC_ALIGN
SYM_CODE_START_NOALIGN(xen_failsafe_callback)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ENDBR
movl %ds, %ecx
cmpw %cx, 0x10(%rsp)
@@ -1027,7 +1027,7 @@ SYM_CODE_START_LOCAL(paranoid_exit)
*
* NB to anyone to try to optimize this code: this code does
* not execute at all for exceptions from user mode. Those
- * exceptions go through error_exit instead.
+ * exceptions go through error_return instead.
*/
RESTORE_CR3 scratch_reg=%rax save_reg=%r14
@@ -1107,7 +1107,7 @@ SYM_CODE_START(error_entry)
FENCE_SWAPGS_KERNEL_ENTRY
CALL_DEPTH_ACCOUNT
leaq 8(%rsp), %rax /* return pt_regs pointer */
- ANNOTATE_UNRET_END
+ VALIDATE_UNRET_END
RET
.Lbstep_iret:
@@ -1153,7 +1153,7 @@ SYM_CODE_END(error_return)
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
SYM_CODE_START(asm_exc_nmi)
- UNWIND_HINT_IRET_REGS
+ UNWIND_HINT_IRET_ENTRY
ENDBR
/*
@@ -1520,7 +1520,7 @@ SYM_CODE_END(asm_exc_nmi)
* MSRs to fully disable 32-bit SYSCALL.
*/
SYM_CODE_START(ignore_sysret)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ENDBR
mov $-ENOSYS, %eax
sysretl
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..f65c671ce3b1 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 64 map_shadow_stack sys_map_shadow_stack
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 1506a22a4fb6..6a1821bd7d5e 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,10 +3,7 @@
# Building vDSO images for x86.
#
-# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before
-# the inclusion of generic Makefile.
-ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE|
-ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE
+# Include the generic Makefile to check the built vdso.
include $(srctree)/lib/vdso/Makefile
# Sanitizer runtimes are unavailable and cannot be linked here.
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 3b300a773c7e..f3b3cacbcbb0 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -70,18 +70,9 @@ static struct ctl_table abi_table2[] = {
{}
};
-static struct ctl_table abi_root_table2[] = {
- {
- .procname = "abi",
- .mode = 0555,
- .child = abi_table2
- },
- {}
-};
-
static __init int ia32_binfmt_init(void)
{
- register_sysctl_table(abi_root_table2);
+ register_sysctl("abi", abi_table2);
return 0;
}
__initcall(ia32_binfmt_init);
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index d234ca797e4a..e0ca8120aea8 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -317,7 +317,7 @@ static struct vm_area_struct gate_vma __ro_after_init = {
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef CONFIG_COMPAT
- if (!mm || !(mm->context.flags & MM_CONTEXT_HAS_VSYSCALL))
+ if (!mm || !test_bit(MM_CONTEXT_HAS_VSYSCALL, &mm->context.flags))
return NULL;
#endif
if (vsyscall_mode == NONE)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index a3fb996a86a1..070cc4ef2672 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5470,6 +5470,15 @@ pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
}
static umode_t
+mem_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ if (attr == &event_attr_mem_ld_aux.attr.attr)
+ return x86_pmu.flags & PMU_FL_MEM_LOADS_AUX ? attr->mode : 0;
+
+ return pebs_is_visible(kobj, attr, i);
+}
+
+static umode_t
lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return x86_pmu.lbr_nr ? attr->mode : 0;
@@ -5496,7 +5505,7 @@ static struct attribute_group group_events_td = {
static struct attribute_group group_events_mem = {
.name = "events",
- .is_visible = pebs_is_visible,
+ .is_visible = mem_is_visible,
};
static struct attribute_group group_events_tsx = {
@@ -6486,6 +6495,10 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_EMERALDRAPIDS_X:
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ fallthrough;
+ case INTEL_FAM6_GRANITERAPIDS_X:
+ case INTEL_FAM6_GRANITERAPIDS_D:
pmem = true;
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -6502,7 +6515,6 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
- x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = spr_get_event_constraints;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 551741e79e03..835862c548cc 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -678,6 +678,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_X, &icx_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(GRANITERAPIDS_D, &icx_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index c65d8906cbcf..0feaaa571303 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -70,6 +70,8 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_EMERALDRAPIDS_X:
+ case INTEL_FAM6_GRANITERAPIDS_X:
+ case INTEL_FAM6_GRANITERAPIDS_D:
case INTEL_FAM6_ATOM_SILVERMONT:
case INTEL_FAM6_ATOM_SILVERMONT_D:
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 41ef036ebb7b..edbc67ec1f3e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -29,7 +29,6 @@
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include <linux/highmem.h>
-#include <linux/swiotlb.h>
int hyperv_init_cpuhp;
u64 hv_current_partition_id = ~0ull;
@@ -504,16 +503,6 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
-#ifdef CONFIG_SWIOTLB
- /*
- * Swiotlb bounce buffer needs to be mapped in extra address
- * space. Map function doesn't work in the early place and so
- * call swiotlb_update_mem_attributes() here.
- */
- if (hv_is_isolation_supported())
- swiotlb_update_mem_attributes();
-#endif
-
return;
clean_guest_os_id:
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 1dbcbd9da74d..f6a020cb1a24 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -13,6 +13,8 @@
#include <asm/svm.h>
#include <asm/sev.h>
#include <asm/io.h>
+#include <asm/coco.h>
+#include <asm/mem_encrypt.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
@@ -233,41 +235,6 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
-#endif
-
-enum hv_isolation_type hv_get_isolation_type(void)
-{
- if (!(ms_hyperv.priv_high & HV_ISOLATION))
- return HV_ISOLATION_TYPE_NONE;
- return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
-}
-EXPORT_SYMBOL_GPL(hv_get_isolation_type);
-
-/*
- * hv_is_isolation_supported - Check system runs in the Hyper-V
- * isolation VM.
- */
-bool hv_is_isolation_supported(void)
-{
- if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
- return false;
-
- if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
- return false;
-
- return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
-}
-
-DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
-
-/*
- * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
- * isolation VM.
- */
-bool hv_isolation_type_snp(void)
-{
- return static_branch_unlikely(&isolation_type_snp);
-}
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
@@ -320,27 +287,25 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
}
/*
- * hv_set_mem_host_visibility - Set specified memory visible to host.
+ * hv_vtom_set_host_visibility - Set specified memory visible to host.
*
* In Isolation VM, all guest memory is encrypted from host and guest
* needs to set memory visible to host via hvcall before sharing memory
* with host. This function works as wrap of hv_mark_gpa_visibility()
* with memory base and size.
*/
-int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visible)
+static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc)
{
- enum hv_mem_host_visibility visibility = visible ?
- VMBUS_PAGE_VISIBLE_READ_WRITE : VMBUS_PAGE_NOT_VISIBLE;
+ enum hv_mem_host_visibility visibility = enc ?
+ VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE;
u64 *pfn_array;
int ret = 0;
+ bool result = true;
int i, pfn;
- if (!hv_is_isolation_supported() || !hv_hypercall_pg)
- return 0;
-
pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
if (!pfn_array)
- return -ENOMEM;
+ return false;
for (i = 0, pfn = 0; i < pagecount; i++) {
pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE);
@@ -349,17 +314,68 @@ int hv_set_mem_host_visibility(unsigned long kbuffer, int pagecount, bool visibl
if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) {
ret = hv_mark_gpa_visibility(pfn, pfn_array,
visibility);
- if (ret)
+ if (ret) {
+ result = false;
goto err_free_pfn_array;
+ }
pfn = 0;
}
}
err_free_pfn_array:
kfree(pfn_array);
- return ret;
+ return result;
+}
+
+static bool hv_vtom_tlb_flush_required(bool private)
+{
+ return true;
}
+static bool hv_vtom_cache_flush_required(void)
+{
+ return false;
+}
+
+static bool hv_is_private_mmio(u64 addr)
+{
+ /*
+ * Hyper-V always provides a single IO-APIC in a guest VM.
+ * When a paravisor is used, it is emulated by the paravisor
+ * in the guest context and must be mapped private.
+ */
+ if (addr >= HV_IOAPIC_BASE_ADDRESS &&
+ addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE))
+ return true;
+
+ /* Same with a vTPM */
+ if (addr >= VTPM_BASE_ADDRESS &&
+ addr < (VTPM_BASE_ADDRESS + PAGE_SIZE))
+ return true;
+
+ return false;
+}
+
+void __init hv_vtom_init(void)
+{
+ /*
+ * By design, a VM using vTOM doesn't see the SEV setting,
+ * so SEV initialization is bypassed and sev_status isn't set.
+ * Set it here to indicate a vTOM VM.
+ */
+ sev_status = MSR_AMD64_SNP_VTOM;
+ cc_set_vendor(CC_VENDOR_AMD);
+ cc_set_mask(ms_hyperv.shared_gpa_boundary);
+ physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
+
+ x86_platform.hyper.is_private_mmio = hv_is_private_mmio;
+ x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required;
+ x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
+ x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+}
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
/*
* hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
*/
@@ -377,7 +393,7 @@ void *hv_map_memory(void *addr, unsigned long size)
pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
(ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
- vaddr = vmap_pfn(pfns, size / PAGE_SIZE, PAGE_KERNEL_IO);
+ vaddr = vmap_pfn(pfns, size / PAGE_SIZE, pgprot_decrypted(PAGE_KERNEL));
kfree(pfns);
return vaddr;
@@ -387,3 +403,37 @@ void hv_unmap_memory(void *addr)
{
vunmap(addr);
}
+
+enum hv_isolation_type hv_get_isolation_type(void)
+{
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
+ return HV_ISOLATION_TYPE_NONE;
+ return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
+}
+EXPORT_SYMBOL_GPL(hv_get_isolation_type);
+
+/*
+ * hv_is_isolation_supported - Check system runs in the Hyper-V
+ * isolation VM.
+ */
+bool hv_is_isolation_supported(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ if (!hypervisor_is_type(X86_HYPER_MS_HYPERV))
+ return false;
+
+ return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
+}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
+
+/*
+ * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * isolation VM.
+ */
+bool hv_isolation_type_snp(void)
+{
+ return static_branch_unlikely(&isolation_type_snp);
+}
diff --git a/arch/x86/include/asm/coco.h b/arch/x86/include/asm/coco.h
index 3d98c3a60d34..eb08796002f3 100644
--- a/arch/x86/include/asm/coco.h
+++ b/arch/x86/include/asm/coco.h
@@ -7,17 +7,33 @@
enum cc_vendor {
CC_VENDOR_NONE,
CC_VENDOR_AMD,
- CC_VENDOR_HYPERV,
CC_VENDOR_INTEL,
};
-void cc_set_vendor(enum cc_vendor v);
-void cc_set_mask(u64 mask);
-
#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+extern enum cc_vendor cc_vendor;
+
+static inline enum cc_vendor cc_get_vendor(void)
+{
+ return cc_vendor;
+}
+
+static inline void cc_set_vendor(enum cc_vendor vendor)
+{
+ cc_vendor = vendor;
+}
+
+void cc_set_mask(u64 mask);
u64 cc_mkenc(u64 val);
u64 cc_mkdec(u64 val);
#else
+static inline enum cc_vendor cc_get_vendor(void)
+{
+ return CC_VENDOR_NONE;
+}
+
+static inline void cc_set_vendor(enum cc_vendor vendor) { }
+
static inline u64 cc_mkenc(u64 val)
{
return val;
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 73c9672c123b..d9c190cdefa9 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -309,6 +309,7 @@
#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+#define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@@ -321,6 +322,7 @@
#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
+#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
@@ -378,6 +380,7 @@
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK (16*32+ 7) /* "" Shadow stack */
#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 5dfa4fb76f4b..702d93fdd10e 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -75,6 +75,12 @@
# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
#endif
+#ifdef CONFIG_ADDRESS_MASKING
+# define DISABLE_LAM 0
+#else
+# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31))
+#endif
+
#ifdef CONFIG_INTEL_IOMMU_SVM
# define DISABLE_ENQCMD 0
#else
@@ -99,6 +105,18 @@
# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31))
#endif
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+#define DISABLE_USER_SHSTK 0
+#else
+#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31))
+#endif
+
+#ifdef CONFIG_X86_KERNEL_IBT
+#define DISABLE_IBT 0
+#else
+#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31))
+#endif
+
/*
* Make sure to add features to the correct mask
*/
@@ -114,15 +132,15 @@
#define DISABLED_MASK9 (DISABLE_SGX)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
- DISABLE_CALL_DEPTH_TRACKING)
-#define DISABLED_MASK12 0
+ DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK)
+#define DISABLED_MASK12 (DISABLE_LAM)
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
DISABLE_ENQCMD)
#define DISABLED_MASK17 0
-#define DISABLED_MASK18 0
+#define DISABLED_MASK18 (DISABLE_IBT)
#define DISABLED_MASK19 0
#define DISABLED_MASK20 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21)
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 503a577814b2..aadc6893dcaa 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -82,6 +82,15 @@ static inline void fpregs_unlock(void)
preempt_enable();
}
+/*
+ * FPU state gets lazily restored before returning to userspace. So when in the
+ * kernel, the valid FPU state may be kept in the buffer. This function will force
+ * restore all the fpu state to the registers early if needed, and lock them from
+ * being automatically saved/restored. Then FPU state can be modified safely in the
+ * registers, before unlocking with fpregs_unlock().
+ */
+void fpregs_lock_and_load(void);
+
#ifdef CONFIG_X86_DEBUG_FPU
extern void fpregs_assert_state_consistent(void);
#else
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
index 4f928d6a367b..697b77e96025 100644
--- a/arch/x86/include/asm/fpu/regset.h
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -7,11 +7,12 @@
#include <linux/regset.h>
-extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active,
+ ssp_active;
extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get,
- xstateregs_get;
+ xstateregs_get, ssp_get;
extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
- xstateregs_set;
+ xstateregs_set, ssp_set;
/*
* xstateregs_active == regset_fpregs_active. Please refer to the comment
diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h
index c2d6cd78ed0c..3c2903bbb456 100644
--- a/arch/x86/include/asm/fpu/sched.h
+++ b/arch/x86/include/asm/fpu/sched.h
@@ -11,7 +11,8 @@
extern void save_fpregs_to_fpstate(struct fpu *fpu);
extern void fpu__drop(struct fpu *fpu);
-extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal);
+extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+ unsigned long shstk_addr);
extern void fpu_flush_thread(void);
/*
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 7f6d858ff47a..eb810074f1e7 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -115,8 +115,8 @@ enum xfeature {
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
XFEATURE_PASID,
- XFEATURE_RSRVD_COMP_11,
- XFEATURE_RSRVD_COMP_12,
+ XFEATURE_CET_USER,
+ XFEATURE_CET_KERNEL_UNUSED,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -138,6 +138,8 @@ enum xfeature {
#define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_PASID (1 << XFEATURE_PASID)
+#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL (1 << XFEATURE_CET_KERNEL_UNUSED)
#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR)
#define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG)
#define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA)
@@ -253,6 +255,16 @@ struct pkru_state {
} __packed;
/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+ /* user control-flow settings */
+ u64 user_cet;
+ /* user shadow stack pointer */
+ u64 user_ssp;
+};
+
+/*
* State component 15: Architectural LBR configuration state.
* The size of Arch LBR state depends on the number of LBRs (lbr_depth).
*/
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index cd3dd170e23a..d4427b88ee12 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -50,7 +50,8 @@
#define XFEATURE_MASK_USER_DYNAMIC XFEATURE_MASK_XTILE_DATA
/* All currently supported supervisor features */
-#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
+#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER)
/*
* A supervisor state component may not always contain valuable information,
@@ -77,7 +78,8 @@
* Unsupported supervisor features. When a supervisor feature in this mask is
* supported in the future, move it to the supported supervisor feature mask.
*/
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
+ XFEATURE_MASK_CET_KERNEL)
/* All supervisor states including supported and unsupported states. */
#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index b241af4ce9b4..61e0e6301f09 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -614,7 +614,7 @@ DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_DF, xenpv_exc_double_fault);
#endif
/* #CP */
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
#endif
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index c201083b34f6..a3abdcd89a32 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -20,25 +20,4 @@ extern void intel_mid_pwr_power_off(void);
extern int intel_mid_pwr_get_lss_id(struct pci_dev *pdev);
-#ifdef CONFIG_X86_INTEL_MID
-
-extern void intel_scu_devices_create(void);
-extern void intel_scu_devices_destroy(void);
-
-#else /* !CONFIG_X86_INTEL_MID */
-
-static inline void intel_scu_devices_create(void) { }
-static inline void intel_scu_devices_destroy(void) { }
-
-#endif /* !CONFIG_X86_INTEL_MID */
-
-/* Bus Select SoC Fuse value */
-#define BSEL_SOC_FUSE_MASK 0x7
-/* FSB 133MHz */
-#define BSEL_SOC_FUSE_001 0x1
-/* FSB 100MHz */
-#define BSEL_SOC_FUSE_101 0x5
-/* FSB 83MHz */
-#define BSEL_SOC_FUSE_111 0x7
-
#endif /* _ASM_X86_INTEL_MID_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 72ca90552b6a..b7126701574c 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -56,6 +56,7 @@ void __init sev_es_init_vc_handling(void);
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
+#define sev_status 0ULL
static inline void __init sme_early_encrypt(resource_size_t paddr,
unsigned long size) { }
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea9..0da5c227f490 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,9 +9,13 @@
#include <linux/bits.h>
/* Uprobes on this MM assume 32-bit code */
-#define MM_CONTEXT_UPROBE_IA32 BIT(0)
+#define MM_CONTEXT_UPROBE_IA32 0
/* vsyscall page is accessible on this MM */
-#define MM_CONTEXT_HAS_VSYSCALL BIT(1)
+#define MM_CONTEXT_HAS_VSYSCALL 1
+/* Do not allow changing LAM mode */
+#define MM_CONTEXT_LOCK_LAM 2
+/* Allow LAM and SVA coexisting */
+#define MM_CONTEXT_FORCE_TAGGED_SVA 3
/*
* x86 has arch-specific MMU state beyond what lives in mm_struct.
@@ -39,7 +43,15 @@ typedef struct {
#endif
#ifdef CONFIG_X86_64
- unsigned short flags;
+ unsigned long flags;
+#endif
+
+#ifdef CONFIG_ADDRESS_MASKING
+ /* Active LAM mode: X86_CR3_LAM_U48 or X86_CR3_LAM_U57 or 0 (disabled) */
+ unsigned long lam_cr3_mask;
+
+ /* Significant bits of the virtual address. Excludes tag bits. */
+ u64 untag_mask;
#endif
struct mutex lock;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index e01aa74a6de7..416901d406f8 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -16,13 +16,6 @@
extern atomic64_t last_mm_ctx_id;
-#ifndef CONFIG_PARAVIRT_XXL
-static inline void paravirt_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
-{
-}
-#endif /* !CONFIG_PARAVIRT_XXL */
-
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
@@ -92,6 +85,51 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
}
#endif
+#ifdef CONFIG_ADDRESS_MASKING
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return mm->context.lam_cr3_mask;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
+ mm->context.untag_mask = oldmm->context.untag_mask;
+}
+
+#define mm_untag_mask mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+ return mm->context.untag_mask;
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+ mm->context.untag_mask = -1UL;
+}
+
+#define arch_pgtable_dma_compat arch_pgtable_dma_compat
+static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+{
+ return !mm_lam_cr3_mask(mm) ||
+ test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
+}
+#else
+
+static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
+{
+ return 0;
+}
+
+static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+}
+
+static inline void mm_reset_untag_mask(struct mm_struct *mm)
+{
+}
+#endif
+
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
@@ -116,6 +154,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.execute_only_pkey = -1;
}
#endif
+ mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
}
@@ -135,7 +174,7 @@ extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#define activate_mm(prev, next) \
do { \
- paravirt_activate_mm((prev), (next)); \
+ paravirt_enter_mmap(next); \
switch_mm((prev), (next), NULL); \
} while (0);
@@ -147,6 +186,8 @@ do { \
#else
#define deactivate_mm(tsk, mm) \
do { \
+ if (!tsk->vfork_done) \
+ shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
@@ -168,7 +209,8 @@ static inline void arch_dup_pkeys(struct mm_struct *oldmm,
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
- paravirt_arch_dup_mmap(oldmm, mm);
+ paravirt_enter_mmap(mm);
+ dup_lam(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
@@ -182,7 +224,7 @@ static inline void arch_exit_mmap(struct mm_struct *mm)
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
- !(mm->context.flags & MM_CONTEXT_UPROBE_IA32);
+ !test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 4c4c0ec3b62e..e3cef98a0142 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,14 @@
#include <asm/paravirt.h>
#include <asm/mshyperv.h>
+/*
+ * Hyper-V always provides a single IO-APIC at this MMIO address.
+ * Ideally, the value should be looked up in ACPI tables, but it
+ * is needed for mapping the IO-APIC early in boot on Confidential
+ * VMs, before ACPI functions can be used.
+ */
+#define HV_IOAPIC_BASE_ADDRESS 0xfec00000
+
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
@@ -206,18 +214,19 @@ struct irq_domain *hv_create_pci_msi_domain(void);
int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
struct hv_interrupt_entry *entry);
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
-int hv_set_mem_host_visibility(unsigned long addr, int numpages, bool visible);
#ifdef CONFIG_AMD_MEM_ENCRYPT
void hv_ghcb_msr_write(u64 msr, u64 value);
void hv_ghcb_msr_read(u64 msr, u64 *value);
bool hv_ghcb_negotiate_protocol(void);
void hv_ghcb_terminate(unsigned int set, unsigned int reason);
+void hv_vtom_init(void);
#else
static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
+static inline void hv_vtom_init(void) {}
#endif
extern bool hv_isolation_type_snp(void);
@@ -259,11 +268,6 @@ static inline void hv_set_register(unsigned int reg, u64 value) { }
static inline u64 hv_get_register(unsigned int reg) { return 0; }
static inline void hv_set_non_nested_register(unsigned int reg, u64 value) { }
static inline u64 hv_get_non_nested_register(unsigned int reg) { return 0; }
-static inline int hv_set_mem_host_visibility(unsigned long addr, int numpages,
- bool visible)
-{
- return -1;
-}
#endif /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 3ef70e54a858..edb2b0cb8efe 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -194,9 +194,9 @@
* builds.
*/
.macro ANNOTATE_RETPOLINE_SAFE
- .Lannotate_\@:
+.Lhere_\@:
.pushsection .discard.retpoline_safe
- _ASM_PTR .Lannotate_\@
+ .long .Lhere_\@ - .
.popsection
.endm
@@ -210,8 +210,8 @@
* Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should
* eventually turn into it's own annotation.
*/
-.macro ANNOTATE_UNRET_END
-#ifdef CONFIG_DEBUG_ENTRY
+.macro VALIDATE_UNRET_END
+#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
ANNOTATE_RETPOLINE_SAFE
nop
#endif
@@ -286,7 +286,7 @@
.macro UNTRAIN_RET
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
defined(CONFIG_CALL_DEPTH_TRACKING)
- ANNOTATE_UNRET_END
+ VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
@@ -297,7 +297,7 @@
.macro UNTRAIN_RET_FROM_CALL
#if defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_IBPB_ENTRY) || \
defined(CONFIG_CALL_DEPTH_TRACKING)
- ANNOTATE_UNRET_END
+ VALIDATE_UNRET_END
ALTERNATIVE_3 "", \
CALL_ZEN_UNTRAIN_RET, X86_FEATURE_UNRET, \
"call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \
@@ -318,7 +318,7 @@
#define ANNOTATE_RETPOLINE_SAFE \
"999:\n\t" \
".pushsection .discard.retpoline_safe\n\t" \
- _ASM_PTR " 999b\n\t" \
+ ".long 999b - .\n\t" \
".popsection\n\t"
typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h
index 1343a62106de..46d7e06763c9 100644
--- a/arch/x86/include/asm/orc_types.h
+++ b/arch/x86/include/asm/orc_types.h
@@ -39,6 +39,12 @@
#define ORC_REG_SP_INDIRECT 9
#define ORC_REG_MAX 15
+#define ORC_TYPE_UNDEFINED 0
+#define ORC_TYPE_END_OF_STACK 1
+#define ORC_TYPE_CALL 2
+#define ORC_TYPE_REGS 3
+#define ORC_TYPE_REGS_PARTIAL 4
+
#ifndef __ASSEMBLY__
#include <asm/byteorder.h>
@@ -56,16 +62,14 @@ struct orc_entry {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned sp_reg:4;
unsigned bp_reg:4;
- unsigned type:2;
+ unsigned type:3;
unsigned signal:1;
- unsigned end:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned bp_reg:4;
unsigned sp_reg:4;
unsigned unused:4;
- unsigned end:1;
unsigned signal:1;
- unsigned type:2;
+ unsigned type:3;
#endif
} __packed;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cf40e813b3d7..b49778664d2b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -334,16 +334,9 @@ static inline void tss_update_io_bitmap(void)
}
#endif
-static inline void paravirt_activate_mm(struct mm_struct *prev,
- struct mm_struct *next)
+static inline void paravirt_enter_mmap(struct mm_struct *next)
{
- PVOP_VCALL2(mmu.activate_mm, prev, next);
-}
-
-static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
-{
- PVOP_VCALL2(mmu.dup_mmap, oldmm, mm);
+ PVOP_VCALL1(mmu.enter_mmap, next);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
@@ -789,8 +782,7 @@ extern void default_banner(void);
#ifndef __ASSEMBLY__
#ifndef CONFIG_PARAVIRT_XXL
-static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
- struct mm_struct *mm)
+static inline void paravirt_enter_mmap(struct mm_struct *mm)
{
}
#endif
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 8c1da419260f..4acbcddddc29 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -164,11 +164,8 @@ struct pv_mmu_ops {
unsigned long (*read_cr3)(void);
void (*write_cr3)(unsigned long);
- /* Hooks for intercepting the creation/use of an mm_struct. */
- void (*activate_mm)(struct mm_struct *prev,
- struct mm_struct *next);
- void (*dup_mmap)(struct mm_struct *oldmm,
- struct mm_struct *mm);
+ /* Hook for intercepting the creation/use of an mm_struct. */
+ void (*enter_mmap)(struct mm_struct *mm);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
@@ -562,8 +559,14 @@ void paravirt_flush_lazy_mmu(void);
void _paravirt_nop(void);
void paravirt_BUG(void);
-u64 _paravirt_ident_64(u64);
unsigned long paravirt_ret0(void);
+#ifdef CONFIG_PARAVIRT_XXL
+u64 _paravirt_ident_64(u64);
+unsigned long pv_native_save_fl(void);
+void pv_native_irq_disable(void);
+void pv_native_irq_enable(void);
+unsigned long pv_native_read_cr2(void);
+#endif
#define paravirt_nop ((void *)_paravirt_nop)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 15ae4d6ba476..6f57f285d007 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -124,9 +124,17 @@ extern pmdval_t early_pmd_flags;
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
{
- return pte_flags(pte) & _PAGE_DIRTY;
+ return pte_flags(pte) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pte_shstk(pte_t pte)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return false;
+
+ return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
}
static inline int pte_young(pte_t pte)
@@ -134,9 +142,18 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_DIRTY;
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return false;
+
+ return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY | _PAGE_PSE)) ==
+ (_PAGE_DIRTY | _PAGE_PSE);
}
#define pmd_young pmd_young
@@ -145,9 +162,9 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
}
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
{
- return pud_flags(pud) & _PAGE_DIRTY;
+ return pud_flags(pud) & _PAGE_DIRTY_BITS;
}
static inline int pud_young(pud_t pud)
@@ -157,7 +174,27 @@ static inline int pud_young(pud_t pud)
static inline int pte_write(pte_t pte)
{
- return pte_flags(pte) & _PAGE_RW;
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
+}
+
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
+{
+ /*
+ * Shadow stack pages are logically writable, but do not have
+ * _PAGE_RW. Check for them separately from _PAGE_RW itself.
+ */
+ return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_RW;
}
static inline int pte_huge(pte_t pte)
@@ -289,9 +326,57 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear)
return native_make_pte(v & ~clear);
}
+/*
+ * Write protection operations can result in Dirty=1,Write=0 PTEs. But in the
+ * case of X86_FEATURE_USER_SHSTK, the software SavedDirty bit is used, since
+ * the Dirty=1,Write=0 will result in the memory being treated as shadow stack
+ * by the HW. So when creating dirty, write-protected memory, a software bit is
+ * used _PAGE_BIT_SAVED_DIRTY. The following functions pte_mksaveddirty() and
+ * pte_clear_saveddirty() take a conventional dirty, write-protected PTE
+ * (Write=0,Dirty=1) and transition it to the shadow stack compatible
+ * version. (Write=0,SavedDirty=1).
+ */
+static inline pte_t pte_mksaveddirty(pte_t pte)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return pte;
+
+ pte = pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_set_flags(pte, _PAGE_SAVED_DIRTY);
+}
+
+static inline pte_t pte_clear_saveddirty(pte_t pte)
+{
+ /*
+ * _PAGE_SAVED_DIRTY is unnecessary on !X86_FEATURE_USER_SHSTK kernels,
+ * since the HW dirty bit can be used without creating shadow stack
+ * memory. See the _PAGE_SAVED_DIRTY definition for more details.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return pte;
+
+ /*
+ * PTE is getting copied-on-write, so it will be dirtied
+ * if writable, or made shadow stack if shadow stack and
+ * being copied on access. Set the dirty bit for both
+ * cases.
+ */
+ pte = pte_set_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_SAVED_DIRTY);
+}
+
static inline pte_t pte_wrprotect(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_RW);
+ pte = pte_clear_flags(pte, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PTE (Write=0,Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ if (pte_dirty(pte))
+ pte = pte_mksaveddirty(pte);
+ return pte;
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -329,7 +414,7 @@ static inline pte_t pte_clear_uffd_wp(pte_t pte)
static inline pte_t pte_mkclean(pte_t pte)
{
- return pte_clear_flags(pte, _PAGE_DIRTY);
+ return pte_clear_flags(pte, _PAGE_DIRTY_BITS);
}
static inline pte_t pte_mkold(pte_t pte)
@@ -344,7 +429,19 @@ static inline pte_t pte_mkexec(pte_t pte)
static inline pte_t pte_mkdirty(pte_t pte)
{
- return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pteval_t dirty = _PAGE_DIRTY;
+
+ /* Avoid creating Dirty=1,Write=0 PTEs */
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK) && !pte_write(pte))
+ dirty = _PAGE_SAVED_DIRTY;
+
+ return pte_set_flags(pte, dirty | _PAGE_SOFT_DIRTY);
+}
+
+static inline pte_t pte_mkwrite_shstk(pte_t pte)
+{
+ /* pte_clear_saveddirty() also sets Dirty=1 */
+ return pte_clear_saveddirty(pte);
}
static inline pte_t pte_mkyoung(pte_t pte)
@@ -352,11 +449,15 @@ static inline pte_t pte_mkyoung(pte_t pte)
return pte_set_flags(pte, _PAGE_ACCESSED);
}
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite_kernel(pte_t pte)
{
return pte_set_flags(pte, _PAGE_RW);
}
+struct vm_area_struct;
+
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma);
+
static inline pte_t pte_mkhuge(pte_t pte)
{
return pte_set_flags(pte, _PAGE_PSE);
@@ -401,9 +502,37 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
return native_make_pmd(v & ~clear);
}
+/* See comments above pte_mksaveddirty() */
+static inline pmd_t pmd_mksaveddirty(pmd_t pmd)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return pmd;
+
+ pmd = pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_set_flags(pmd, _PAGE_SAVED_DIRTY);
+}
+
+/* See comments above pte_mksaveddirty() */
+static inline pmd_t pmd_clear_saveddirty(pmd_t pmd)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return pmd;
+
+ pmd = pmd_set_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_SAVED_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_RW);
+ pmd = pmd_clear_flags(pmd, _PAGE_RW);
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PMD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ if (pmd_dirty(pmd))
+ pmd = pmd_mksaveddirty(pmd);
+ return pmd;
}
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
@@ -430,12 +559,23 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
static inline pmd_t pmd_mkclean(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_DIRTY);
+ return pmd_clear_flags(pmd, _PAGE_DIRTY_BITS);
}
static inline pmd_t pmd_mkdirty(pmd_t pmd)
{
- return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pmdval_t dirty = _PAGE_DIRTY;
+
+ /* Avoid creating (HW)Dirty=1, Write=0 PMDs */
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK) && !pmd_write(pmd))
+ dirty = _PAGE_SAVED_DIRTY;
+
+ return pmd_set_flags(pmd, dirty | _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_mkwrite_shstk(pmd_t pmd)
+{
+ return pmd_clear_saveddirty(pmd);
}
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
@@ -453,10 +593,7 @@ static inline pmd_t pmd_mkyoung(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_ACCESSED);
}
-static inline pmd_t pmd_mkwrite(pmd_t pmd)
-{
- return pmd_set_flags(pmd, _PAGE_RW);
-}
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
{
@@ -472,6 +609,26 @@ static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
return native_make_pud(v & ~clear);
}
+/* See comments above pte_mksaveddirty() */
+static inline pud_t pud_mksaveddirty(pud_t pud)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return pud;
+
+ pud = pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_set_flags(pud, _PAGE_SAVED_DIRTY);
+}
+
+/* See comments above pte_mksaveddirty() */
+static inline pud_t pud_clear_saveddirty(pud_t pud)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return pud;
+
+ pud = pud_set_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_SAVED_DIRTY);
+}
+
static inline pud_t pud_mkold(pud_t pud)
{
return pud_clear_flags(pud, _PAGE_ACCESSED);
@@ -479,17 +636,32 @@ static inline pud_t pud_mkold(pud_t pud)
static inline pud_t pud_mkclean(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_DIRTY);
+ return pud_clear_flags(pud, _PAGE_DIRTY_BITS);
}
static inline pud_t pud_wrprotect(pud_t pud)
{
- return pud_clear_flags(pud, _PAGE_RW);
+ pud = pud_clear_flags(pud, _PAGE_RW);
+
+ /*
+ * Blindly clearing _PAGE_RW might accidentally create
+ * a shadow stack PUD (RW=0, Dirty=1). Move the hardware
+ * dirty value to the software bit.
+ */
+ if (pud_dirty(pud))
+ pud = pud_mksaveddirty(pud);
+ return pud;
}
static inline pud_t pud_mkdirty(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
+ pudval_t dirty = _PAGE_DIRTY;
+
+ /* Avoid creating (HW)Dirty=1, Write=0 PUDs */
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK) && !pud_write(pud))
+ dirty = _PAGE_SAVED_DIRTY;
+
+ return pud_set_flags(pud, dirty | _PAGE_SOFT_DIRTY);
}
static inline pud_t pud_mkdevmap(pud_t pud)
@@ -509,7 +681,11 @@ static inline pud_t pud_mkyoung(pud_t pud)
static inline pud_t pud_mkwrite(pud_t pud)
{
- return pud_set_flags(pud, _PAGE_RW);
+ pud = pud_set_flags(pud, _PAGE_RW);
+
+ if (pud_dirty(pud))
+ pud = pud_clear_saveddirty(pud);
+ return pud;
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -626,6 +802,8 @@ static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
pteval_t val = pte_val(pte), oldval = val;
+ bool wr_protected;
+ pte_t pte_result;
/*
* Chop off the NX bit (if present), and add the NX portion of
@@ -634,17 +812,43 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
- return __pte(val);
+
+ pte_result = __pte(val);
+
+ /*
+ * Do the saveddirty fixup if the PTE was just write protected and
+ * it's dirty.
+ */
+ wr_protected = (oldval & _PAGE_RW) && !(val & _PAGE_RW);
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK) && wr_protected &&
+ (val & _PAGE_DIRTY))
+ pte_result = pte_mksaveddirty(pte_result);
+
+ return pte_result;
}
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
pmdval_t val = pmd_val(pmd), oldval = val;
+ bool wr_protected;
+ pmd_t pmd_result;
- val &= _HPAGE_CHG_MASK;
+ val &= (_HPAGE_CHG_MASK & ~_PAGE_DIRTY);
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
- return __pmd(val);
+
+ pmd_result = __pmd(val);
+
+ /*
+ * Do the saveddirty fixup if the PMD was just write protected and
+ * it's dirty.
+ */
+ wr_protected = (oldval & _PAGE_RW) && !(val & _PAGE_RW);
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK) && wr_protected &&
+ (val & _PAGE_DIRTY))
+ pmd_result = pmd_mksaveddirty(pmd_result);
+
+ return pmd_result;
}
/*
@@ -828,7 +1032,15 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* (Currently stuck as a macro because of indirect forward reference
* to linux/mm.h:page_to_nid())
*/
-#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
+#define mk_pte(page, pgprot) \
+({ \
+ pgprot_t __pgprot = pgprot; \
+ \
+ WARN_ON_ONCE(cpu_feature_enabled(X86_FEATURE_USER_SHSTK) && \
+ (pgprot_val(__pgprot) & (_PAGE_DIRTY | _PAGE_RW)) == \
+ _PAGE_DIRTY); \
+ pfn_pte(page_to_pfn(page), __pgprot); \
+})
static inline int pmd_bad(pmd_t pmd)
{
@@ -1094,6 +1306,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) {
+ pte_t old_pte, new_pte;
+
+ old_pte = READ_ONCE(*ptep);
+ do {
+ new_pte = pte_wrprotect(old_pte);
+ } while (!try_cmpxchg(&ptep->pte, &old_pte.pte, new_pte.pte));
+
+ return;
+ }
+#endif
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
}
@@ -1120,12 +1349,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define pmd_write pmd_write
-static inline int pmd_write(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_RW;
-}
-
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
@@ -1152,13 +1375,25 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
-}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ /*
+ * Avoid accidentally creating shadow stack PTEs
+ * (Write=0,Dirty=1). Use cmpxchg() to prevent races with
+ * the hardware setting Dirty=1.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) {
+ pmd_t old_pmd, new_pmd;
-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
- return pud_flags(pud) & _PAGE_RW;
+ old_pmd = READ_ONCE(*pmdp);
+ do {
+ new_pmd = pmd_wrprotect(old_pmd);
+ } while (!try_cmpxchg(&pmdp->pmd, &old_pmd.pmd, new_pmd.pmd));
+
+ return;
+ }
+#endif
+
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
}
#ifndef pmdp_establish
@@ -1411,6 +1646,11 @@ static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+ /*
+ * Write=0,Dirty=1 PTEs are shadow stack, which the kernel
+ * shouldn't generally allow access to, but since they
+ * are already Write=0, the below logic covers both cases.
+ */
if (write)
need_pte_bits |= _PAGE_RW;
@@ -1452,6 +1692,12 @@ static inline bool arch_has_hw_pte_young(void)
return true;
}
+#define arch_check_zapped_pte arch_check_zapped_pte
+void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte);
+
+#define arch_check_zapped_pmd arch_check_zapped_pmd
+void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd);
+
#ifdef CONFIG_XEN_PV
#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
static inline bool arch_has_hw_nonleaf_pmd_young(void)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 447d4bee25c4..8f266788c0d7 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -21,7 +21,8 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
-#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_SOFTW4 57 /* available for programmer */
+#define _PAGE_BIT_SOFTW5 58 /* available for programmer */
#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
@@ -34,6 +35,15 @@
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+/*
+ * Indicates a Saved Dirty bit page.
+ */
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+#define _PAGE_BIT_SAVED_DIRTY _PAGE_BIT_SOFTW5 /* Saved Dirty bit */
+#else
+#define _PAGE_BIT_SAVED_DIRTY 0
+#endif
+
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -117,6 +127,25 @@
#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
#endif
+/*
+ * The hardware requires shadow stack to be Write=0,Dirty=1. However,
+ * there are valid cases where the kernel might create read-only PTEs that
+ * are dirty (e.g., fork(), mprotect(), uffd-wp(), soft-dirty tracking). In
+ * this case, the _PAGE_SAVED_DIRTY bit is used instead of the HW-dirty bit,
+ * to avoid creating a wrong "shadow stack" PTEs. Such PTEs have
+ * (Write=0,SavedDirty=1,Dirty=0) set.
+ *
+ * Note that on processors without shadow stack support, the
+ * _PAGE_SAVED_DIRTY remains unused.
+ */
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SAVED_DIRTY)
+#else
+#define _PAGE_SAVED_DIRTY (_AT(pteval_t, 0))
+#endif
+
+#define _PAGE_DIRTY_BITS (_PAGE_DIRTY | _PAGE_SAVED_DIRTY)
+
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
/*
@@ -125,9 +154,9 @@
* instance, and is *not* included in this mask since
* pte_modify() does modify it.
*/
-#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
+#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
+ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY_BITS | \
+ _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \
_PAGE_UFFD_WP)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
@@ -186,16 +215,21 @@ enum page_cache_mode {
#define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
#define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
-#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
-#define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0)
-#define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC)
+/*
+ * Page tables needs to have Write=1 in order for any lower PTEs to be
+ * writable. This includes shadow stack memory (Write=0, Dirty=1)
+ */
#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
-#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G)
-#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0|___D| 0|___G)
+#define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0)
+#define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC)
+
+#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
+#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX| 0| 0|___G)
+#define __PAGE_KERNEL_ROX (__PP| 0| 0|___A| 0| 0| 0|___G)
#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
-#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX| 0| 0|___G)
#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
#define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a7f3d9100adb..d8cccadc83a6 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -28,6 +28,8 @@
* On systems with SME, one bit (in a variable position!) is stolen to indicate
* that the top-level paging structure is encrypted.
*
+ * On systemms with LAM, bits 61 and 62 are used to indicate LAM mode.
+ *
* All of the remaining bits indicate the physical address of the top-level
* paging structure.
*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 8d73004e4cac..2a5ec5750ba7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -28,6 +28,7 @@ struct vm86;
#include <asm/unwind_hints.h>
#include <asm/vmxfeatures.h>
#include <asm/vdso/processor.h>
+#include <asm/shstk.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -475,6 +476,13 @@ struct thread_struct {
*/
u32 pkru;
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ unsigned long features;
+ unsigned long features_locked;
+
+ struct thread_shstk shstk;
+#endif
+
/* Floating point and extended processor state */
struct fpu fpu;
/*
@@ -647,7 +655,11 @@ static inline void spin_lock_prefetch(const void *x)
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
#else
-#define INIT_THREAD { }
+extern unsigned long __end_init_task[];
+
+#define INIT_THREAD { \
+ .sp = (unsigned long)&__end_init_task - sizeof(struct pt_regs), \
+}
extern unsigned long KSTK_ESP(struct task_struct *task);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index a336feef0af1..f6a1737c77be 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -59,7 +59,6 @@ extern struct real_mode_header *real_mode_header;
extern unsigned char real_mode_blob_end[];
extern unsigned long initial_code;
-extern unsigned long initial_gs;
extern unsigned long initial_stack;
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern unsigned long initial_vc_handler;
diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h
index b63be696b776..0759af9b1acf 100644
--- a/arch/x86/include/asm/sev-common.h
+++ b/arch/x86/include/asm/sev-common.h
@@ -128,10 +128,6 @@ struct snp_psc_desc {
struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY];
} __packed;
-/* Guest message request error codes */
-#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32)
-#define SNP_GUEST_REQ_ERR_BUSY BIT_ULL(33)
-
#define GHCB_MSR_TERM_REQ 0x100
#define GHCB_MSR_TERM_REASON_SET_POS 12
#define GHCB_MSR_TERM_REASON_SET_MASK 0xf
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index ebc271bb6d8e..13dc2a9d23c1 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -9,6 +9,8 @@
#define __ASM_ENCRYPTED_STATE_H
#include <linux/types.h>
+#include <linux/sev-guest.h>
+
#include <asm/insn.h>
#include <asm/sev-common.h>
#include <asm/bootparam.h>
@@ -185,6 +187,9 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate)
return rc;
}
+
+struct snp_guest_request_ioctl;
+
void setup_ghcb(void);
void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
unsigned int npages);
@@ -196,7 +201,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages);
void snp_set_wakeup_secondary_cpu(void);
bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err);
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
@@ -216,8 +221,7 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag
static inline void snp_set_wakeup_secondary_cpu(void) { }
static inline bool snp_init(struct boot_params *bp) { return false; }
static inline void snp_abort(void) { }
-static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input,
- unsigned long *fw_err)
+static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
return -ENOTTY;
}
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index 4a03993e0785..2631e01f6e0f 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -7,8 +7,6 @@
#define TDX_HYPERCALL_STANDARD 0
-#define TDX_HCALL_HAS_OUTPUT BIT(0)
-
#define TDX_CPUID_LEAF_ID 0x21
#define TDX_IDENT "IntelTDX "
@@ -36,7 +34,8 @@ struct tdx_hypercall_args {
};
/* Used to request services from the VMM */
-u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags);
+u64 __tdx_hypercall(struct tdx_hypercall_args *args);
+u64 __tdx_hypercall_ret(struct tdx_hypercall_args *args);
/* Called from __tdx_hypercall() for unrecoverable failure */
void __tdx_hypercall_failed(void);
diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
new file mode 100644
index 000000000000..42fee8959df7
--- /dev/null
+++ b/arch/x86/include/asm/shstk.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SHSTK_H
+#define _ASM_X86_SHSTK_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+struct ksignal;
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+struct thread_shstk {
+ u64 base;
+ u64 size;
+};
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2);
+void reset_thread_features(void);
+unsigned long shstk_alloc_thread_stack(struct task_struct *p, unsigned long clone_flags,
+ unsigned long stack_size);
+void shstk_free(struct task_struct *p);
+int setup_signal_shadow_stack(struct ksignal *ksig);
+int restore_signal_shadow_stack(void);
+#else
+static inline long shstk_prctl(struct task_struct *task, int option,
+ unsigned long arg2) { return -EINVAL; }
+static inline void reset_thread_features(void) {}
+static inline unsigned long shstk_alloc_thread_stack(struct task_struct *p,
+ unsigned long clone_flags,
+ unsigned long stack_size) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
+static inline int restore_signal_shadow_stack(void) { return 0; }
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_SHSTK_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index b4dbb20dab1a..88f6a4724757 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -93,12 +93,13 @@ static inline void __cpu_die(unsigned int cpu)
smp_ops.cpu_die(cpu);
}
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
smp_ops.play_dead();
+ BUG();
}
-static inline void smp_send_reschedule(int cpu)
+static inline void arch_smp_send_reschedule(int cpu)
{
smp_ops.smp_send_reschedule(cpu);
}
@@ -199,5 +200,8 @@ extern void nmi_selftest(void);
#define nmi_selftest() do { } while (0)
#endif
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index de48d1389936..d6cd9344f6c7 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -202,6 +202,19 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static inline int write_user_shstk_64(u64 __user *addr, u64 val)
+{
+ asm_volatile_goto("1: wrussq %[val], (%[addr])\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: [addr] "r" (addr), [val] "r" (val)
+ :: fail);
+ return 0;
+fail:
+ return -EFAULT;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
#define nop() asm volatile ("nop")
static inline void serialize(void)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cda3118f3b27..965659d2c965 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_TLBFLUSH_H
#define _ASM_X86_TLBFLUSH_H
-#include <linux/mm.h>
+#include <linux/mm_types.h>
#include <linux/sched.h>
#include <asm/processor.h>
@@ -12,6 +12,7 @@
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
+#include <asm/pgtable.h>
void __flush_tlb_all(void);
@@ -53,6 +54,15 @@ static inline void cr4_clear_bits(unsigned long mask)
local_irq_restore(flags);
}
+#ifdef CONFIG_ADDRESS_MASKING
+DECLARE_PER_CPU(u64, tlbstate_untag_mask);
+
+static inline u64 current_untag_mask(void)
+{
+ return this_cpu_read(tlbstate_untag_mask);
+}
+#endif
+
#ifndef MODULE
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
@@ -101,6 +111,16 @@ struct tlb_state {
*/
bool invalidate_other;
+#ifdef CONFIG_ADDRESS_MASKING
+ /*
+ * Active LAM mode.
+ *
+ * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM
+ * disabled.
+ */
+ u8 lam;
+#endif
+
/*
* Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
* the corresponding user PCID needs a flush next time we
@@ -273,7 +293,8 @@ static inline bool pte_flags_need_flush(unsigned long oldflags,
const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
_PAGE_ACCESSED;
const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
- _PAGE_SOFTW3 | _PAGE_SOFTW4;
+ _PAGE_SOFTW3 | _PAGE_SOFTW4 |
+ _PAGE_SAVED_DIRTY;
const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
_PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
_PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
@@ -357,6 +378,32 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#define huge_pmd_needs_flush huge_pmd_needs_flush
+#ifdef CONFIG_ADDRESS_MASKING
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ u64 lam = this_cpu_read(cpu_tlbstate.lam);
+
+ return lam << X86_CR3_LAM_U57_BIT;
+}
+
+static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+{
+ this_cpu_write(cpu_tlbstate.lam,
+ mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
+ this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
+}
+
+#else
+
+static inline u64 tlbstate_lam_cr3_mask(void)
+{
+ return 0;
+}
+
+static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
+{
+}
+#endif
#endif /* !MODULE */
static inline void __native_tlb_flush_global(unsigned long cr4)
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 10b1de500ab1..afa524325e55 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
+ * bit 6 == 1: shadow stack access fault
* bit 15 == 1: SGX MMU page-fault
*/
enum x86_pf_error_code {
@@ -20,6 +21,7 @@ enum x86_pf_error_code {
X86_PF_RSVD = 1 << 3,
X86_PF_INSTR = 1 << 4,
X86_PF_PK = 1 << 5,
+ X86_PF_SHSTK = 1 << 6,
X86_PF_SGX = 1 << 15,
};
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 47ecfff2c83d..75e0dabf0c45 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -47,4 +47,16 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
struct stack_info *info);
#endif
+static inline void cond_local_irq_enable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_enable();
+}
+
+static inline void cond_local_irq_disable(struct pt_regs *regs)
+{
+ if (regs->flags & X86_EFLAGS_IF)
+ local_irq_disable();
+}
+
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 1cc756eafa44..457e814712af 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -7,11 +7,14 @@
#include <linux/compiler.h>
#include <linux/instrumented.h>
#include <linux/kasan-checks.h>
+#include <linux/mm_types.h>
#include <linux/string.h>
+#include <linux/mmap_lock.h>
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
#include <asm/extable.h>
+#include <asm/tlbflush.h>
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline bool pagefault_disabled(void);
@@ -21,6 +24,57 @@ static inline bool pagefault_disabled(void);
# define WARN_ON_IN_IRQ()
#endif
+#ifdef CONFIG_ADDRESS_MASKING
+/*
+ * Mask out tag bits from the address.
+ *
+ * Magic with the 'sign' allows to untag userspace pointer without any branches
+ * while leaving kernel addresses intact.
+ */
+static inline unsigned long __untagged_addr(unsigned long addr)
+{
+ long sign;
+
+ /*
+ * Refer tlbstate_untag_mask directly to avoid RIP-relative relocation
+ * in alternative instructions. The relocation gets wrong when gets
+ * copied to the target place.
+ */
+ asm (ALTERNATIVE("",
+ "sar $63, %[sign]\n\t" /* user_ptr ? 0 : -1UL */
+ "or %%gs:tlbstate_untag_mask, %[sign]\n\t"
+ "and %[sign], %[addr]\n\t", X86_FEATURE_LAM)
+ : [addr] "+r" (addr), [sign] "=r" (sign)
+ : "m" (tlbstate_untag_mask), "[sign]" (addr));
+
+ return addr;
+}
+
+#define untagged_addr(addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr(__addr); \
+})
+
+static inline unsigned long __untagged_addr_remote(struct mm_struct *mm,
+ unsigned long addr)
+{
+ long sign = addr >> 63;
+
+ mmap_assert_locked(mm);
+ addr &= (mm)->context.untag_mask | sign;
+
+ return addr;
+}
+
+#define untagged_addr_remote(mm, addr) ({ \
+ unsigned long __addr = (__force unsigned long)(addr); \
+ (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \
+})
+
+#else
+#define untagged_addr(addr) (addr)
+#endif
+
/**
* access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
@@ -38,10 +92,10 @@ static inline bool pagefault_disabled(void);
* Return: true (nonzero) if the memory block may be valid, false (zero)
* if it is definitely invalid.
*/
-#define access_ok(addr, size) \
+#define access_ok(addr, size) \
({ \
WARN_ON_IN_IRQ(); \
- likely(__access_ok(addr, size)); \
+ likely(__access_ok(untagged_addr(addr), size)); \
})
#include <asm-generic/access_ok.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index d13d71af5cf6..c6b1dcded364 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -62,8 +62,6 @@ extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
-extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
- size_t len);
static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h
index e7c71750b309..01cb9692b160 100644
--- a/arch/x86/include/asm/unwind_hints.h
+++ b/arch/x86/include/asm/unwind_hints.h
@@ -7,12 +7,17 @@
#ifdef __ASSEMBLY__
-.macro UNWIND_HINT_EMPTY
- UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1
+.macro UNWIND_HINT_END_OF_STACK
+ UNWIND_HINT type=UNWIND_HINT_TYPE_END_OF_STACK
+.endm
+
+.macro UNWIND_HINT_UNDEFINED
+ UNWIND_HINT type=UNWIND_HINT_TYPE_UNDEFINED
.endm
.macro UNWIND_HINT_ENTRY
- UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1
+ VALIDATE_UNRET_BEGIN
+ UNWIND_HINT_END_OF_STACK
.endm
.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 signal=1
@@ -52,6 +57,11 @@
UNWIND_HINT_REGS base=\base offset=\offset partial=1 signal=\signal
.endm
+.macro UNWIND_HINT_IRET_ENTRY base=%rsp offset=0 signal=1
+ VALIDATE_UNRET_BEGIN
+ UNWIND_HINT_IRET_REGS base=\base offset=\offset signal=\signal
+.endm
+
.macro UNWIND_HINT_FUNC
UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC
.endm
@@ -67,7 +77,7 @@
#else
#define UNWIND_HINT_FUNC \
- UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0, 0)
+ UNWIND_HINT(UNWIND_HINT_TYPE_FUNC, ORC_REG_SP, 8, 0)
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index c1c8c581759d..acc20ae4079d 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -259,11 +259,15 @@ struct x86_legacy_features {
* VMMCALL under SEV-ES. Needs to return 'false'
* if the checks fail. Called from the #VC
* exception handler.
+ * @is_private_mmio: For CoCo VMs, must map MMIO address as private.
+ * Used when device is emulated by a paravisor
+ * layer in the VM context.
*/
struct x86_hyper_runtime {
void (*pin_vcpu)(int cpu);
void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs);
bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs);
+ bool (*is_private_mmio)(u64 addr);
};
/**
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 775dbd3aff73..8148bdddbd2c 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -3,6 +3,7 @@
#define _ASM_X86_MMAN_H
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
+#define MAP_ABOVE4G 0x80 /* only map above 4GB */
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
#define arch_calc_vm_prot_bits(prot, key) ( \
@@ -12,6 +13,9 @@
((key) & 0x8 ? VM_PKEY_BIT3 : 0))
#endif
+/* Flags for map_shadow_stack(2) */
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0) /* Set up a restore token in the shadow stack */
+
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 500b96e71f18..384e2cc6ac19 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -16,8 +16,28 @@
#define ARCH_GET_XCOMP_GUEST_PERM 0x1024
#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
+#define ARCH_XCOMP_TILECFG 17
+#define ARCH_XCOMP_TILEDATA 18
+
#define ARCH_MAP_VDSO_X32 0x2001
#define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003
+/* Don't use 0x3001-0x3004 because of old glibcs */
+
+#define ARCH_GET_UNTAG_MASK 0x4001
+#define ARCH_ENABLE_TAGGED_ADDR 0x4002
+#define ARCH_GET_MAX_TAG_BITS 0x4003
+#define ARCH_FORCE_TAGGED_SVA 0x4004
+
+#define ARCH_SHSTK_ENABLE 0x5001
+#define ARCH_SHSTK_DISABLE 0x5002
+#define ARCH_SHSTK_LOCK 0x5003
+#define ARCH_SHSTK_UNLOCK 0x5004
+#define ARCH_SHSTK_STATUS 0x5005
+
+/* ARCH_SHSTK_ features bits */
+#define ARCH_SHSTK_SHSTK (1ULL << 0)
+#define ARCH_SHSTK_WRSS (1ULL << 1)
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index c47cc7f2feeb..d898432947ff 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -82,6 +82,10 @@
#define X86_CR3_PCID_BITS 12
#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+#define X86_CR3_LAM_U57_BIT 61 /* Activate LAM for userspace, 62:57 bits masked */
+#define X86_CR3_LAM_U57 _BITULL(X86_CR3_LAM_U57_BIT)
+#define X86_CR3_LAM_U48_BIT 62 /* Activate LAM for userspace, 62:48 bits masked */
+#define X86_CR3_LAM_U48 _BITULL(X86_CR3_LAM_U48_BIT)
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
@@ -132,6 +136,8 @@
#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
#define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement Technology */
#define X86_CR4_CET _BITUL(X86_CR4_CET_BIT)
+#define X86_CR4_LAM_SUP_BIT 28 /* LAM for supervisor pointers */
+#define X86_CR4_LAM_SUP _BITUL(X86_CR4_LAM_SUP_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dd61752f4c96..b366641703e3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -144,6 +144,10 @@ obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_CALL_THUNKS) += callthunks.o
+obj-$(CONFIG_X86_CET) += cet.o
+
+obj-$(CONFIG_X86_USER_SHADOW_STACK) += shstk.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0dac4ab5b55b..21b542a6866c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1858,13 +1858,18 @@ early_param("acpi_sci", setup_acpi_sci);
int __acpi_acquire_global_lock(unsigned int *lock)
{
- unsigned int old, new;
+ unsigned int old, new, val;
old = READ_ONCE(*lock);
do {
- new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+ val = (old >> 1) & 0x1;
+ new = (old & ~0x3) + 2 + val;
} while (!try_cmpxchg(lock, &old, new));
- return ((new & 0x3) < 3) ? -1 : 0;
+
+ if (val)
+ return 0;
+
+ return -1;
}
int __acpi_release_global_lock(unsigned int *lock)
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 3b7f4cdbf2e0..1328c221af30 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -111,13 +111,26 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
- initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
- early_gdt_descr.address =
- (unsigned long)get_cpu_gdt_rw(smp_processor_id());
- initial_gs = per_cpu_offset(smp_processor_id());
+ /*
+ * As each CPU starts up, it will find its own stack pointer
+ * from its current_task->thread.sp. Typically that will be
+ * the idle thread for a newly-started AP, or even the boot
+ * CPU which will find it set to &init_task in the static
+ * per-cpu data.
+ *
+ * Make the resuming CPU use the temporary stack at startup
+ * by setting current->thread.sp to point to that. The true
+ * %rsp will be restored with the rest of the CPU context,
+ * by do_suspend_lowlevel(). And unwinders don't care about
+ * the abuse of ->thread.sp because it's a dead variable
+ * while the thread is running on the CPU anyway; the true
+ * value is in the actual %rsp register.
+ */
+ current->thread.sp = (unsigned long)temp_stack + sizeof(temp_stack);
+ smpboot_control = smp_processor_id();
#endif
initial_code = (unsigned long)wakeup_long64;
- saved_magic = 0x123456789abcdef0L;
+ saved_magic = 0x123456789abcdef0L;
#endif /* CONFIG_64BIT */
/*
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 20d9a604da7c..770557110051 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -422,10 +422,9 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
if (vector && !eilvt_entry_is_changeable(vector, new))
/* may not change if vectors are different */
return rsvd;
- rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
- } while (rsvd != new);
+ } while (!atomic_try_cmpxchg(&eilvt_offsets[offset], &rsvd, new));
- rsvd &= ~APIC_EILVT_MASKED;
+ rsvd = new & ~APIC_EILVT_MASKED;
if (rsvd && rsvd != vector)
pr_info("LVT offset %d assigned for vector 0x%02x\n",
offset, rsvd);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1f83b052bb74..146671de9ddc 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -66,6 +66,7 @@
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/pgtable.h>
+#include <asm/x86_init.h>
#define for_each_ioapic(idx) \
for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
@@ -2680,10 +2681,15 @@ static void io_apic_set_fixmap(enum fixed_addresses idx, phys_addr_t phys)
pgprot_t flags = FIXMAP_PAGE_NOCACHE;
/*
- * Ensure fixmaps for IOAPIC MMIO respect memory encryption pgprot
+ * Ensure fixmaps for IO-APIC MMIO respect memory encryption pgprot
* bits, just like normal ioremap():
*/
- flags = pgprot_decrypted(flags);
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
+ if (x86_platform.hyper.is_private_mmio(phys))
+ flags = pgprot_encrypted(flags);
+ else
+ flags = pgprot_decrypted(flags);
+ }
__set_fixmap(idx, phys, flags);
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e696e22d0531..b2b2b7f3e03f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -9,11 +9,7 @@
#include "local.h"
-struct cluster_mask {
- unsigned int clusterid;
- int node;
- struct cpumask mask;
-};
+#define apic_cluster(apicid) ((apicid) >> 4)
/*
* __x2apic_send_IPI_mask() possibly needs to read
@@ -23,8 +19,7 @@ struct cluster_mask {
static u32 *x86_cpu_to_logical_apicid __read_mostly;
static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
+static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
@@ -60,10 +55,10 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
/* Collapse cpus in a cluster so a single IPI per cluster is sent */
for_each_cpu(cpu, tmpmsk) {
- struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
+ struct cpumask *cmsk = per_cpu(cluster_masks, cpu);
dest = 0;
- for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
+ for_each_cpu_and(clustercpu, tmpmsk, cmsk)
dest |= x86_cpu_to_logical_apicid[clustercpu];
if (!dest)
@@ -71,7 +66,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
/* Remove cluster CPUs from tmpmask */
- cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
+ cpumask_andnot(tmpmsk, tmpmsk, cmsk);
}
local_irq_restore(flags);
@@ -105,55 +100,98 @@ static u32 x2apic_calc_apicid(unsigned int cpu)
static void init_x2apic_ldr(void)
{
- struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
- u32 cluster, apicid = apic_read(APIC_LDR);
- unsigned int cpu;
+ struct cpumask *cmsk = this_cpu_read(cluster_masks);
- x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
+ BUG_ON(!cmsk);
- if (cmsk)
- goto update;
-
- cluster = apicid >> 16;
- for_each_online_cpu(cpu) {
- cmsk = per_cpu(cluster_masks, cpu);
- /* Matching cluster found. Link and update it. */
- if (cmsk && cmsk->clusterid == cluster)
- goto update;
+ cpumask_set_cpu(smp_processor_id(), cmsk);
+}
+
+/*
+ * As an optimisation during boot, set the cluster_mask for all present
+ * CPUs at once, to prevent each of them having to iterate over the others
+ * to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster)
+{
+ int cpu_i;
+
+ for_each_present_cpu(cpu_i) {
+ struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i);
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster)
+ continue;
+
+ if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
+ continue;
+
+ BUG_ON(*cpu_cmsk);
+ *cpu_cmsk = cmsk;
}
- cmsk = cluster_hotplug_mask;
- cmsk->clusterid = cluster;
- cluster_hotplug_mask = NULL;
-update:
- this_cpu_write(cluster_masks, cmsk);
- cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
}
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
{
+ struct cpumask *cmsk = NULL;
+ unsigned int cpu_i;
+
+ /*
+ * At boot time, the CPU present mask is stable. The cluster mask is
+ * allocated for the first CPU in the cluster and propagated to all
+ * present siblings in the cluster. If the cluster mask is already set
+ * on entry to this function for a given CPU, there is nothing to do.
+ */
if (per_cpu(cluster_masks, cpu))
return 0;
+
+ if (system_state < SYSTEM_RUNNING)
+ goto alloc;
+
/*
- * If a hotplug spare mask exists, check whether it's on the right
- * node. If not, free it and allocate a new one.
+ * On post boot hotplug for a CPU which was not present at boot time,
+ * iterate over all possible CPUs (even those which are not present
+ * any more) to find any existing cluster mask.
*/
- if (cluster_hotplug_mask) {
- if (cluster_hotplug_mask->node == node)
- return 0;
- kfree(cluster_hotplug_mask);
+ for_each_possible_cpu(cpu_i) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+ if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+ cmsk = per_cpu(cluster_masks, cpu_i);
+ /*
+ * If the cluster is already initialized, just store
+ * the mask and return. There's no need to propagate.
+ */
+ if (cmsk) {
+ per_cpu(cluster_masks, cpu) = cmsk;
+ return 0;
+ }
+ }
}
-
- cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
- GFP_KERNEL, node);
- if (!cluster_hotplug_mask)
+ /*
+ * No CPU in the cluster has ever been initialized, so fall through to
+ * the boot time code which will also populate the cluster mask for any
+ * other CPU in the cluster which is (now) present.
+ */
+alloc:
+ cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+ if (!cmsk)
return -ENOMEM;
- cluster_hotplug_mask->node = node;
+ per_cpu(cluster_masks, cpu) = cmsk;
+ prefill_clustermask(cmsk, cpu, cluster);
+
return 0;
}
static int x2apic_prepare_cpu(unsigned int cpu)
{
- if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+ u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+ u32 cluster = apic_cluster(phys_apicid);
+ u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+ x86_cpu_to_logical_apicid[cpu] = logical_apicid;
+
+ if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
return -ENOMEM;
if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
return -ENOMEM;
@@ -162,10 +200,10 @@ static int x2apic_prepare_cpu(unsigned int cpu)
static int x2apic_dead_cpu(unsigned int dead_cpu)
{
- struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
+ struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu);
if (cmsk)
- cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ cpumask_clear_cpu(dead_cpu, cmsk);
free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
return 0;
}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 283dcd2f62c8..dc3576303f1a 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -115,6 +115,7 @@ static void __used common(void)
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+ OFFSET(X86_current_task, pcpu_hot, current_task);
#ifdef CONFIG_CALL_DEPTH_TRACKING
OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index 000000000000..cc10d8be9d74
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <asm/bugs.h>
+#include <asm/traps.h>
+
+enum cp_error_code {
+ CP_EC = (1 << 15) - 1,
+
+ CP_RET = 1,
+ CP_IRET = 2,
+ CP_ENDBR = 3,
+ CP_RSTRORSSP = 4,
+ CP_SETSSBSY = 5,
+
+ CP_ENCL = 1 << 15,
+};
+
+static const char cp_err[][10] = {
+ [0] = "unknown",
+ [1] = "near ret",
+ [2] = "far/iret",
+ [3] = "endbranch",
+ [4] = "rstorssp",
+ [5] = "setssbsy",
+};
+
+static const char *cp_err_string(unsigned long error_code)
+{
+ unsigned int cpec = error_code & CP_EC;
+
+ if (cpec >= ARRAY_SIZE(cp_err))
+ cpec = 0;
+ return cp_err[cpec];
+}
+
+static void do_unexpected_cp(struct pt_regs *regs, unsigned long error_code)
+{
+ WARN_ONCE(1, "Unexpected %s #CP, error_code: %s\n",
+ user_mode(regs) ? "user mode" : "kernel mode",
+ cp_err_string(error_code));
+}
+
+static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ struct task_struct *tsk;
+ unsigned long ssp;
+
+ /*
+ * An exception was just taken from userspace. Since interrupts are disabled
+ * here, no scheduling should have messed with the registers yet and they
+ * will be whatever is live in userspace. So read the SSP before enabling
+ * interrupts so locking the fpregs to do it later is not required.
+ */
+ rdmsrl(MSR_IA32_PL3_SSP, ssp);
+
+ cond_local_irq_enable(regs);
+
+ tsk = current;
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_CP;
+
+ /* Ratelimit to prevent log spamming. */
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ __ratelimit(&cpf_rate)) {
+ pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)%s",
+ tsk->comm, task_pid_nr(tsk),
+ regs->ip, regs->sp, ssp, error_code,
+ cp_err_string(error_code),
+ error_code & CP_ENCL ? " in enclave" : "");
+ print_vma_addr(KERN_CONT " in ", regs->ip);
+ pr_cont("\n");
+ }
+
+ force_sig_fault(SIGSEGV, SEGV_CPERR, (void __user *)0);
+ cond_local_irq_disable(regs);
+}
+
+static __ro_after_init bool ibt_fatal = true;
+
+/* code label defined in asm below */
+extern void ibt_selftest_ip(void);
+
+static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ if ((error_code & CP_EC) != CP_ENDBR) {
+ do_unexpected_cp(regs, error_code);
+ return;
+ }
+
+ if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) {
+ regs->ax = 0;
+ return;
+ }
+
+ pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
+ if (!ibt_fatal) {
+ printk(KERN_DEFAULT CUT_HERE);
+ __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ return;
+ }
+ BUG();
+}
+
+/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */
+noinline bool ibt_selftest(void)
+{
+ unsigned long ret;
+
+ asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t"
+ ANNOTATE_RETPOLINE_SAFE
+ " jmp *%%rax\n\t"
+ "ibt_selftest_ip:\n\t"
+ UNWIND_HINT_FUNC
+ ANNOTATE_NOENDBR
+ " nop\n\t"
+
+ : "=a" (ret) : : "memory");
+
+ return !ret;
+}
+
+static int __init ibt_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ setup_clear_cpu_cap(X86_FEATURE_IBT);
+
+ if (!strcmp(str, "warn"))
+ ibt_fatal = false;
+
+ return 1;
+}
+
+__setup("ibt=", ibt_setup);
+
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+ if (user_mode(regs)) {
+ if (cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ do_user_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ } else {
+ if (cpu_feature_enabled(X86_FEATURE_IBT))
+ do_kernel_cp_fault(regs, error_code);
+ else
+ do_unexpected_cp(regs, error_code);
+ }
+}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 95cdd08c4cbb..28ff91a8d9ca 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1005,6 +1005,17 @@ static void init_amd(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
+
+ /*
+ * Make sure EFER[AIBRSE - Automatic IBRS Enable] is set. The APs are brought up
+ * using the trampoline code and as part of it, MSR_EFER gets prepared there in
+ * order to be replicated onto them. Regardless, set it here again, if not set,
+ * to protect against any future refactoring/code reorganization which might
+ * miss setting this important bit.
+ */
+ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
+ cpu_has(c, X86_FEATURE_AUTOIBRS))
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index f9d060e71c3e..182af64387d0 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -784,8 +784,7 @@ static int __init nospectre_v1_cmdline(char *str)
}
early_param("nospectre_v1", nospectre_v1_cmdline);
-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
- SPECTRE_V2_NONE;
+enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE;
#undef pr_fmt
#define pr_fmt(fmt) "RETBleed: " fmt
@@ -1133,13 +1132,6 @@ spectre_v2_parse_user_cmdline(void)
return SPECTRE_V2_USER_CMD_AUTO;
}
-static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
-{
- return mode == SPECTRE_V2_EIBRS ||
- mode == SPECTRE_V2_EIBRS_RETPOLINE ||
- mode == SPECTRE_V2_EIBRS_LFENCE;
-}
-
static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
{
return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8cd4126d8253..3ea06b0b4570 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -121,6 +121,7 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
@@ -600,27 +601,43 @@ __noendbr void ibt_restore(u64 save)
static __always_inline void setup_cet(struct cpuinfo_x86 *c)
{
- u64 msr = CET_ENDBR_EN;
+ bool user_shstk, kernel_ibt;
- if (!HAS_KERNEL_IBT ||
- !cpu_feature_enabled(X86_FEATURE_IBT))
+ if (!IS_ENABLED(CONFIG_X86_CET))
return;
- wrmsrl(MSR_IA32_S_CET, msr);
+ kernel_ibt = HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT);
+ user_shstk = cpu_feature_enabled(X86_FEATURE_SHSTK) &&
+ IS_ENABLED(CONFIG_X86_USER_SHADOW_STACK);
+
+ if (!kernel_ibt && !user_shstk)
+ return;
+
+ if (user_shstk)
+ set_cpu_cap(c, X86_FEATURE_USER_SHSTK);
+
+ if (kernel_ibt)
+ wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN);
+ else
+ wrmsrl(MSR_IA32_S_CET, 0);
+
cr4_set_bits(X86_CR4_CET);
- if (!ibt_selftest()) {
+ if (kernel_ibt && !ibt_selftest()) {
pr_err("IBT selftest: Failed!\n");
wrmsrl(MSR_IA32_S_CET, 0);
setup_clear_cpu_cap(X86_FEATURE_IBT);
- return;
}
}
__noendbr void cet_disable(void)
{
- if (cpu_feature_enabled(X86_FEATURE_IBT))
- wrmsrl(MSR_IA32_S_CET, 0);
+ if (!(cpu_feature_enabled(X86_FEATURE_IBT) ||
+ cpu_feature_enabled(X86_FEATURE_SHSTK)))
+ return;
+
+ wrmsrl(MSR_IA32_S_CET, 0);
+ wrmsrl(MSR_IA32_U_CET, 0);
}
/*
@@ -1482,6 +1499,9 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
+ setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 57a5349e6954..f97b0fe13da8 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -83,4 +83,12 @@ unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
+extern enum spectre_v2_mitigation spectre_v2_enabled;
+
+static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
+{
+ return mode == SPECTRE_V2_EIBRS ||
+ mode == SPECTRE_V2_EIBRS_RETPOLINE ||
+ mode == SPECTRE_V2_EIBRS_LFENCE;
+}
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index f6748c8bd647..e462c1d3800a 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -81,6 +81,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_XFD, X86_FEATURE_XSAVES },
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
+ { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
{}
};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 291d4167fab8..1c648b09e053 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1451,31 +1451,13 @@ void handle_bus_lock(struct pt_regs *regs)
}
/*
- * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
- * only be trusted if it is confirmed that a CPU model implements a
- * specific feature at a particular bit position.
- *
- * The possible driver data field values:
- *
- * - 0: CPU models that are known to have the per-core split-lock detection
- * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
- *
- * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use
- * bit 5 to enumerate the per-core split-lock detection feature.
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
*/
static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
{}
};
@@ -1487,24 +1469,27 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
+ /* Check for CPUs that have support but do not enumerate it: */
m = x86_match_cpu(split_lock_cpu_ids);
- if (!m)
- return;
+ if (m)
+ goto supported;
- switch (m->driver_data) {
- case 0:
- break;
- case 1:
- if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
- return;
- rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
- if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
- return;
- break;
- default:
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
return;
- }
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
cpu_model_supports_sld = true;
__split_lock_setup();
}
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 23c5072fbbb7..0b971f974096 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -235,10 +235,10 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
* A list of the banks enabled on each logical CPU. Controls which respective
* descriptors to initialize later in mce_threshold_create_device().
*/
-static DEFINE_PER_CPU(unsigned int, bank_map);
+static DEFINE_PER_CPU(u64, bank_map);
/* Map of banks that have more than MCA_MISC0 available. */
-static DEFINE_PER_CPU(u32, smca_misc_banks_map);
+static DEFINE_PER_CPU(u64, smca_misc_banks_map);
static void amd_threshold_interrupt(void);
static void amd_deferred_error_interrupt(void);
@@ -267,7 +267,7 @@ static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu)
return;
if (low & MASK_BLKPTR_LO)
- per_cpu(smca_misc_banks_map, cpu) |= BIT(bank);
+ per_cpu(smca_misc_banks_map, cpu) |= BIT_ULL(bank);
}
@@ -530,7 +530,7 @@ static u32 smca_get_block_address(unsigned int bank, unsigned int block,
if (!block)
return MSR_AMD64_SMCA_MCx_MISC(bank);
- if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank)))
+ if (!(per_cpu(smca_misc_banks_map, cpu) & BIT_ULL(bank)))
return 0;
return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
@@ -574,7 +574,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int new;
if (!block)
- per_cpu(bank_map, cpu) |= (1 << bank);
+ per_cpu(bank_map, cpu) |= BIT_ULL(bank);
memset(&b, 0, sizeof(b));
b.cpu = cpu;
@@ -878,7 +878,7 @@ static void amd_threshold_interrupt(void)
return;
for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ if (!(per_cpu(bank_map, cpu) & BIT_ULL(bank)))
continue;
first_block = bp[bank]->blocks;
@@ -1029,7 +1029,7 @@ static const struct sysfs_ops threshold_ops = {
static void threshold_block_release(struct kobject *kobj);
-static struct kobj_type threshold_ktype = {
+static const struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_groups = default_groups,
.release = threshold_block_release,
@@ -1356,7 +1356,7 @@ int mce_threshold_create_device(unsigned int cpu)
return -ENOMEM;
for (bank = 0; bank < numbanks; ++bank) {
- if (!(this_cpu_read(bank_map) & (1 << bank)))
+ if (!(this_cpu_read(bank_map) & BIT_ULL(bank)))
continue;
err = threshold_create_bank(bp, cpu, bank);
if (err) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 91a415553c27..d2412ce2d312 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -244,11 +244,11 @@ noinstr void pentium_machine_check(struct pt_regs *regs);
noinstr void winchip_machine_check(struct pt_regs *regs);
static inline void enable_p5_mce(void) { mce_p5_enabled = 1; }
#else
-static inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
-static inline void enable_p5_mce(void) {}
-static inline void pentium_machine_check(struct pt_regs *regs) {}
-static inline void winchip_machine_check(struct pt_regs *regs) {}
+static __always_inline void intel_p5_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
+static __always_inline void enable_p5_mce(void) {}
+static __always_inline void pentium_machine_check(struct pt_regs *regs) {}
+static __always_inline void winchip_machine_check(struct pt_regs *regs) {}
#endif
noinstr u64 mce_rdmsrl(u32 msr);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index f5fdeb1e3606..dd33ee872339 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -596,11 +596,6 @@ void reload_ucode_amd(unsigned int cpu)
}
}
}
-static u16 __find_equiv_id(unsigned int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- return find_equiv_id(&equiv_table, uci->cpu_sig.sig);
-}
/*
* a small, trivial cache of per-family ucode patches
@@ -651,9 +646,11 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u16 equiv_id;
- equiv_id = __find_equiv_id(cpu);
+
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
if (!equiv_id)
return NULL;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 7a329e561354..779f70547fb7 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -498,7 +498,8 @@ put:
if (ret == 0)
ret = size;
- add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
return ret;
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f1197366a97d..315fc358e584 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -33,7 +33,6 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
-#include <asm/coco.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
@@ -401,8 +400,10 @@ static void __init ms_hyperv_init_platform(void)
if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
- ms_hyperv.shared_gpa_boundary =
- BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+
+ if (ms_hyperv.shared_gpa_boundary_active)
+ ms_hyperv.shared_gpa_boundary =
+ BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
@@ -413,11 +414,6 @@ static void __init ms_hyperv_init_platform(void)
swiotlb_unencrypted_base = ms_hyperv.shared_gpa_boundary;
#endif
}
- /* Isolation VMs are unenlightened SEV-based VMs, thus this check: */
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- if (hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE)
- cc_set_vendor(CC_VENDOR_HYPERV);
- }
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -486,6 +482,9 @@ static void __init ms_hyperv_init_platform(void)
i8253_clear_counter_on_shutdown = false;
#if IS_ENABLED(CONFIG_HYPERV)
+ if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
+ (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+ hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
*/
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 099b6f0d96bd..31c0e68f6227 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -4,6 +4,8 @@
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include <asm/prctl.h>
+#include <linux/proc_fs.h>
#include "cpu.h"
@@ -175,3 +177,24 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
+
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+static void dump_x86_features(struct seq_file *m, unsigned long features)
+{
+ if (features & ARCH_SHSTK_SHSTK)
+ seq_puts(m, "shstk ");
+ if (features & ARCH_SHSTK_WRSS)
+ seq_puts(m, "wrss ");
+}
+
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task)
+{
+ seq_puts(m, "x86_Thread_features:\t");
+ dump_x86_features(m, task->thread.features);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "x86_Thread_features_locked:\t");
+ dump_x86_features(m, task->thread.features_locked);
+ seq_putc(m, '\n');
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 0e7b6afe2fa6..ded1fc7cb7cb 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -383,41 +383,36 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
+static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
+ enum resctrl_event_id evtid)
+{
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return &d->mbm_total[rmid];
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return &d->mbm_local[rmid];
+ default:
+ return NULL;
+ }
+}
+
static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
struct mbm_state *m;
u64 tval = 0;
- if (rr->first)
+ if (rr->first) {
resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
+ m = get_mbm_state(rr->d, rmid, rr->evtid);
+ if (m)
+ memset(m, 0, sizeof(struct mbm_state));
+ return 0;
+ }
rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
if (rr->err)
return rr->err;
- switch (rr->evtid) {
- case QOS_L3_OCCUP_EVENT_ID:
- rr->val += tval;
- return 0;
- case QOS_L3_MBM_TOTAL_EVENT_ID:
- m = &rr->d->mbm_total[rmid];
- break;
- case QOS_L3_MBM_LOCAL_EVENT_ID:
- m = &rr->d->mbm_local[rmid];
- break;
- default:
- /*
- * Code would never reach here because an invalid
- * event id would fail in resctrl_arch_rmid_read().
- */
- return -EINVAL;
- }
-
- if (rr->first) {
- memset(m, 0, sizeof(struct mbm_state));
- return 0;
- }
-
rr->val += tval;
return 0;
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index caf33486dc5e..aa4856b236b8 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -552,8 +552,36 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu)
}
}
+/* A passed ssp of zero will not cause any update */
+static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp)
+{
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ struct cet_user_state *xstate;
+
+ /* If ssp update is not needed. */
+ if (!ssp)
+ return 0;
+
+ xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave,
+ XFEATURE_CET_USER);
+
+ /*
+ * If there is a non-zero ssp, then 'dst' must be configured with a shadow
+ * stack and the fpu state should be up to date since it was just copied
+ * from the parent in fpu_clone(). So there must be a valid non-init CET
+ * state location in the buffer.
+ */
+ if (WARN_ON_ONCE(!xstate))
+ return 1;
+
+ xstate->user_ssp = (u64)ssp;
+#endif
+ return 0;
+}
+
/* Clone current's FPU state on fork */
-int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
+int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal,
+ unsigned long ssp)
{
struct fpu *src_fpu = &current->thread.fpu;
struct fpu *dst_fpu = &dst->thread.fpu;
@@ -613,6 +641,12 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal)
if (use_xsave())
dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID;
+ /*
+ * Update shadow stack pointer, in case it changed during clone.
+ */
+ if (update_fpu_shstk(dst, ssp))
+ return 1;
+
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
@@ -753,6 +787,24 @@ void switch_fpu_return(void)
}
EXPORT_SYMBOL_GPL(switch_fpu_return);
+void fpregs_lock_and_load(void)
+{
+ /*
+ * fpregs_lock() only disables preemption (mostly). So modifying state
+ * in an interrupt could screw up some in progress fpregs operation.
+ * Warn about it.
+ */
+ WARN_ON_ONCE(!irq_fpu_usable());
+ WARN_ON_ONCE(current->flags & PF_KTHREAD);
+
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ fpregs_restore_userregs();
+}
+
#ifdef CONFIG_X86_DEBUG_FPU
/*
* If current FPU state according to its tracking (loaded FPU context on this
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 6d056b68f4ed..6bc1eb2a21bd 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -8,6 +8,7 @@
#include <asm/fpu/api.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
+#include <asm/prctl.h>
#include "context.h"
#include "internal.h"
@@ -174,6 +175,86 @@ out:
return ret;
}
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+int ssp_active(struct task_struct *target, const struct user_regset *regset)
+{
+ if (target->thread.features & ARCH_SHSTK_SHSTK)
+ return regset->n;
+
+ return 0;
+}
+
+int ssp_get(struct task_struct *target, const struct user_regset *regset,
+ struct membuf to)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct cet_user_state *cetregs;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -ENODEV;
+
+ sync_fpstate(fpu);
+ cetregs = get_xsave_addr(&fpu->fpstate->regs.xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ return membuf_write(&to, (unsigned long *)&cetregs->user_ssp,
+ sizeof(cetregs->user_ssp));
+}
+
+int ssp_set(struct task_struct *target, const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ struct fpu *fpu = &target->thread.fpu;
+ struct xregs_state *xsave = &fpu->fpstate->regs.xsave;
+ struct cet_user_state *cetregs;
+ unsigned long user_ssp;
+ int r;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
+ return -ENODEV;
+
+ if (pos != 0 || count != sizeof(user_ssp))
+ return -EINVAL;
+
+ r = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_ssp, 0, -1);
+ if (r)
+ return r;
+
+ /*
+ * Some kernel instructions (IRET, etc) can cause exceptions in the case
+ * of disallowed CET register values. Just prevent invalid values.
+ */
+ if (user_ssp >= TASK_SIZE_MAX || !IS_ALIGNED(user_ssp, 8))
+ return -EINVAL;
+
+ fpu_force_restore(fpu);
+
+ cetregs = get_xsave_addr(xsave, XFEATURE_CET_USER);
+ if (WARN_ON(!cetregs)) {
+ /*
+ * This shouldn't ever be NULL because shadow stack was
+ * verified to be enabled above. This means
+ * MSR_IA32_U_CET.CET_SHSTK_EN should be 1 and so
+ * XFEATURE_CET_USER should not be in the init state.
+ */
+ return -ENODEV;
+ }
+
+ cetregs->user_ssp = user_ssp;
+ return 0;
+}
+#endif /* CONFIG_X86_USER_SHADOW_STACK */
+
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 0bab497c9436..4fa4751912d9 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -39,26 +39,26 @@
*/
static const char *xfeature_names[] =
{
- "x87 floating point registers" ,
- "SSE registers" ,
- "AVX registers" ,
- "MPX bounds registers" ,
- "MPX CSR" ,
- "AVX-512 opmask" ,
- "AVX-512 Hi256" ,
- "AVX-512 ZMM_Hi256" ,
- "Processor Trace (unused)" ,
+ "x87 floating point registers",
+ "SSE registers",
+ "AVX registers",
+ "MPX bounds registers",
+ "MPX CSR",
+ "AVX-512 opmask",
+ "AVX-512 Hi256",
+ "AVX-512 ZMM_Hi256",
+ "Processor Trace (unused)",
"Protection Keys User registers",
"PASID state",
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "unknown xstate feature" ,
- "AMX Tile config" ,
- "AMX Tile data" ,
- "unknown xstate feature" ,
+ "Control-flow User registers",
+ "Control-flow Kernel registers (unused)",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "unknown xstate feature",
+ "AMX Tile config",
+ "AMX Tile data",
+ "unknown xstate feature",
};
static unsigned short xsave_cpuid_features[] __initdata = {
@@ -73,6 +73,7 @@ static unsigned short xsave_cpuid_features[] __initdata = {
[XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT,
[XFEATURE_PKRU] = X86_FEATURE_PKU,
[XFEATURE_PASID] = X86_FEATURE_ENQCMD,
+ [XFEATURE_CET_USER] = X86_FEATURE_SHSTK,
[XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE,
[XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE,
};
@@ -276,6 +277,7 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
print_xstate_feature(XFEATURE_MASK_PKRU);
print_xstate_feature(XFEATURE_MASK_PASID);
+ print_xstate_feature(XFEATURE_MASK_CET_USER);
print_xstate_feature(XFEATURE_MASK_XTILE_CFG);
print_xstate_feature(XFEATURE_MASK_XTILE_DATA);
}
@@ -344,6 +346,7 @@ static __init void os_xrstor_booting(struct xregs_state *xstate)
XFEATURE_MASK_BNDREGS | \
XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_PASID | \
+ XFEATURE_MASK_CET_USER | \
XFEATURE_MASK_XTILE)
/*
@@ -446,14 +449,15 @@ static void __init __xstate_dump_leaves(void)
} \
} while (0)
-#define XCHECK_SZ(sz, nr, nr_macro, __struct) do { \
- if ((nr == nr_macro) && \
- WARN_ONCE(sz != sizeof(__struct), \
- "%s: struct is %zu bytes, cpu state %d bytes\n", \
- __stringify(nr_macro), sizeof(__struct), sz)) { \
+#define XCHECK_SZ(sz, nr, __struct) ({ \
+ if (WARN_ONCE(sz != sizeof(__struct), \
+ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \
+ xfeature_names[nr], sizeof(__struct), sz)) { \
__xstate_dump_leaves(); \
} \
-} while (0)
+ true; \
+})
+
/**
* check_xtile_data_against_struct - Check tile data state size.
@@ -527,36 +531,28 @@ static bool __init check_xstate_against_struct(int nr)
* Ask the CPU for the size of the state.
*/
int sz = xfeature_size(nr);
+
/*
* Match each CPU state with the corresponding software
* structure.
*/
- XCHECK_SZ(sz, nr, XFEATURE_YMM, struct ymmh_struct);
- XCHECK_SZ(sz, nr, XFEATURE_BNDREGS, struct mpx_bndreg_state);
- XCHECK_SZ(sz, nr, XFEATURE_BNDCSR, struct mpx_bndcsr_state);
- XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
- XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
- XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
- XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
- XCHECK_SZ(sz, nr, XFEATURE_PASID, struct ia32_pasid_state);
- XCHECK_SZ(sz, nr, XFEATURE_XTILE_CFG, struct xtile_cfg);
-
- /* The tile data size varies between implementations. */
- if (nr == XFEATURE_XTILE_DATA)
- check_xtile_data_against_struct(sz);
-
- /*
- * Make *SURE* to add any feature numbers in below if
- * there are "holes" in the xsave state component
- * numbers.
- */
- if ((nr < XFEATURE_YMM) ||
- (nr >= XFEATURE_MAX) ||
- (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
- ((nr >= XFEATURE_RSRVD_COMP_11) && (nr <= XFEATURE_RSRVD_COMP_16))) {
+ switch (nr) {
+ case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct);
+ case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state);
+ case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state);
+ case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state);
+ case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state);
+ case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state);
+ case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state);
+ case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state);
+ case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg);
+ case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state);
+ case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true;
+ default:
XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr);
return false;
}
+
return true;
}
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index fb4f1e01b64a..eddb4fabc16f 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -342,7 +342,7 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
SYM_CODE_START(return_to_handler)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
subq $16, %rsp
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 222efd4a09bc..a5df3e994f04 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -42,7 +42,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
@@ -61,23 +61,15 @@ SYM_CODE_START_NOALIGN(startup_64)
* tables and then reload them.
*/
- /* Set up the stack for verify_cpu(), similar to initial_stack below */
- leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp
+ /* Set up the stack for verify_cpu() */
+ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
leaq _text(%rip), %rdi
- /*
- * initial_gs points to initial fixed_percpu_data struct with storage for
- * the stack protector canary. Global pointer fixups are needed at this
- * stage, so apply them as is done in fixup_pointer(), and initialize %gs
- * such that the canary can be accessed at %gs:40 for subsequent C calls.
- */
+ /* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
- movq initial_gs(%rip), %rax
- movq $_text, %rdx
- subq %rdx, %rax
- addq %rdi, %rax
- movq %rax, %rdx
+ leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
+ movl %edx, %eax
shrq $32, %rdx
wrmsr
@@ -105,7 +97,7 @@ SYM_CODE_START_NOALIGN(startup_64)
lretq
.Lon_kernel_cs:
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/* Sanitize CPU configuration */
call verify_cpu
@@ -127,7 +119,7 @@ SYM_CODE_START_NOALIGN(startup_64)
SYM_CODE_END(startup_64)
SYM_CODE_START(secondary_startup_64)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
@@ -156,7 +148,7 @@ SYM_CODE_START(secondary_startup_64)
* verify_cpu() above to make sure NX is enabled.
*/
SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
@@ -238,16 +230,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
1:
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // above
+#ifdef CONFIG_SMP
+ movl smpboot_control(%rip), %ecx
+
+ /* Get the per cpu offset for the given CPU# which is in ECX */
+ movq __per_cpu_offset(,%rcx,8), %rdx
+#else
+ xorl %edx, %edx /* zero-extended to clear all of RDX */
+#endif /* CONFIG_SMP */
+
+ /*
+ * Setup a boot time stack - Any secondary CPU will have lost its stack
+ * by now because the cr3-switch above unmaps the real-mode stack.
+ *
+ * RDX contains the per-cpu offset
+ */
+ movq pcpu_hot + X86_current_task(%rdx), %rax
+ movq TASK_threadsp(%rax), %rsp
+
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
* addresses where we're currently running on. We have to do that here
* because in 32bit we couldn't load a 64bit linear address.
*/
- lgdt early_gdt_descr(%rip)
+ subq $16, %rsp
+ movw $(GDT_SIZE-1), (%rsp)
+ leaq gdt_page(%rdx), %rax
+ movq %rax, 2(%rsp)
+ lgdt (%rsp)
+ addq $16, %rsp
/* set up data segments */
xorl %eax,%eax
@@ -271,16 +286,13 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movl initial_gs(%rip),%eax
- movl initial_gs+4(%rip),%edx
+#ifndef CONFIG_SMP
+ leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx
+#endif
+ movl %edx, %eax
+ shrq $32, %rdx
wrmsr
- /*
- * Setup a boot time stack - Any secondary CPU will have lost its stack
- * by now because the cr3-switch above unmaps the real-mode stack
- */
- movq initial_stack(%rip), %rsp
-
/* Setup and Load IDT */
pushq %rsi
call early_setup_idt
@@ -371,8 +383,12 @@ SYM_CODE_END(secondary_startup_64)
*/
SYM_CODE_START(start_cpu0)
ANNOTATE_NOENDBR
- UNWIND_HINT_EMPTY
- movq initial_stack(%rip), %rsp
+ UNWIND_HINT_END_OF_STACK
+
+ /* Find the idle task stack */
+ movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq TASK_threadsp(%rcx), %rsp
+
jmp .Ljump_to_C_code
SYM_CODE_END(start_cpu0)
#endif
@@ -390,8 +406,6 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb)
UNWIND_HINT_IRET_REGS offset=8
ENDBR
- ANNOTATE_UNRET_END
-
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
@@ -416,16 +430,9 @@ SYM_CODE_END(vc_boot_ghcb)
__REFDATA
.balign 8
SYM_DATA(initial_code, .quad x86_64_start_kernel)
-SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
#ifdef CONFIG_AMD_MEM_ENCRYPT
SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
#endif
-
-/*
- * The FRAME_SIZE gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
__FINITDATA
__INIT
@@ -451,7 +458,6 @@ SYM_CODE_END(early_idt_handler_array)
SYM_CODE_START_LOCAL(early_idt_handler_common)
UNWIND_HINT_IRET_REGS offset=16
- ANNOTATE_UNRET_END
/*
* The stack is the hardware frame, an error code or zero, and the
* vector number.
@@ -501,8 +507,6 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
UNWIND_HINT_IRET_REGS offset=8
ENDBR
- ANNOTATE_UNRET_END
-
/* Build pt_regs */
PUSH_AND_CLEAR_REGS
@@ -657,8 +661,7 @@ SYM_DATA_END(level1_fixmap_pgt)
.data
.align 16
-SYM_DATA(early_gdt_descr, .word GDT_ENTRIES*8-1)
-SYM_DATA_LOCAL(early_gdt_descr_base, .quad INIT_PER_CPU_VAR(gdt_page))
+SYM_DATA(smpboot_control, .long 0)
.align 16
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index a58c6bc1cd68..5074b8420359 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -107,7 +107,7 @@ static const __initconst struct idt_data def_idts[] = {
ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
#endif
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
INTG(X86_TRAP_CP, asm_exc_control_protection),
#endif
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9ff480e94511..670eb08b972a 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -77,15 +77,6 @@ static struct ctl_table itmt_kern_table[] = {
{}
};
-static struct ctl_table itmt_root_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = itmt_kern_table,
- },
- {}
-};
-
static struct ctl_table_header *itmt_sysctl_header;
/**
@@ -114,7 +105,7 @@ int sched_set_itmt_support(void)
return 0;
}
- itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table);
if (!itmt_sysctl_header) {
mutex_unlock(&itmt_update_mutex);
return -ENOMEM;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 42e182868873..ac10b46c5832 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -64,11 +64,11 @@ static unsigned paravirt_patch_call(void *insn_buff, const void *target,
}
#ifdef CONFIG_PARAVIRT_XXL
-/* identity function, which can be inlined */
-u64 notrace _paravirt_ident_64(u64 x)
-{
- return x;
-}
+DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -197,11 +197,6 @@ void paravirt_end_context_switch(struct task_struct *next)
arch_enter_lazy_mmu_mode();
}
-static noinstr unsigned long pv_native_read_cr2(void)
-{
- return native_read_cr2();
-}
-
static noinstr void pv_native_write_cr2(unsigned long val)
{
native_write_cr2(val);
@@ -222,16 +217,6 @@ noinstr void pv_native_wbinvd(void)
native_wbinvd();
}
-static noinstr void pv_native_irq_enable(void)
-{
- native_irq_enable();
-}
-
-static noinstr void pv_native_irq_disable(void)
-{
- native_irq_disable();
-}
-
static noinstr void pv_native_safe_halt(void)
{
native_safe_halt();
@@ -298,7 +283,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.end_context_switch = paravirt_nop,
/* Irq ops. */
- .irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable),
.irq.safe_halt = pv_native_safe_halt,
@@ -363,8 +348,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.make_pte = PTE_IDENT,
.mmu.make_pgd = PTE_IDENT,
- .mmu.dup_mmap = paravirt_nop,
- .mmu.activate_mm = paravirt_nop,
+ .mmu.enter_mmap = paravirt_nop,
.mmu.lazy_mode = {
.enter = paravirt_nop,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b650cde3f64d..3ab62ac98c2c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
@@ -48,6 +49,8 @@
#include <asm/frame.h>
#include <asm/unwind.h>
#include <asm/tdx.h>
+#include <asm/mmu_context.h>
+#include <asm/shstk.h>
#include "process.h"
@@ -119,6 +122,7 @@ void exit_thread(struct task_struct *tsk)
free_vm86(t);
+ shstk_free(tsk);
fpu__drop(fpu);
}
@@ -140,6 +144,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
struct pt_regs *childregs;
+ unsigned long new_ssp;
int ret = 0;
childregs = task_pt_regs(p);
@@ -162,6 +167,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+
+ if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM)
+ set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags);
#else
p->thread.sp0 = (unsigned long) (childregs + 1);
savesegment(gs, p->thread.gs);
@@ -174,7 +182,16 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame->flags = X86_EFLAGS_FIXED;
#endif
- fpu_clone(p, clone_flags, args->fn);
+ /*
+ * Allocate a new shadow stack for thread if needed. If shadow stack,
+ * is disabled, new_ssp will remain 0, and fpu_clone() will know not to
+ * update it.
+ */
+ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size);
+ if (IS_ERR_VALUE(new_ssp))
+ return PTR_ERR((void *)new_ssp);
+
+ fpu_clone(p, clone_flags, args->fn, new_ssp);
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
@@ -220,6 +237,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
+ /*
+ * If copy_thread() if failing, don't leak the shadow stack possibly
+ * allocated in shstk_alloc_thread_stack() above.
+ */
+ if (ret)
+ shstk_free(p);
+
return ret;
}
@@ -368,6 +392,8 @@ void arch_setup_new_exec(void)
task_clear_spec_ssb_noexec(current);
speculation_ctrl_update(read_thread_flags());
}
+
+ mm_reset_untag_mask(current->mm);
}
#ifdef CONFIG_X86_IOPL_IOPERM
@@ -715,7 +741,7 @@ static bool x86_idle_set(void)
}
#ifndef CONFIG_SMP
-static inline void play_dead(void)
+static inline void __noreturn play_dead(void)
{
BUG();
}
@@ -727,7 +753,7 @@ void arch_cpu_idle_enter(void)
local_touch_nmi();
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
play_dead();
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bb65a68b4b49..3bcbc165c5bb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>
+#include <linux/iommu.h>
#include <asm/processor.h>
#include <asm/pkru.h>
@@ -514,6 +515,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
load_gs_index(__USER_DS);
}
+ reset_thread_features();
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
@@ -671,7 +674,7 @@ void set_personality_64bit(void)
task_pt_regs(current)->orig_ax = __NR_execve;
current_thread_info()->status &= ~TS_COMPAT;
if (current->mm)
- current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
+ __set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags);
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
@@ -708,7 +711,7 @@ static void __set_personality_ia32(void)
* uprobes applied to this MM need to know this and
* cannot use user_64bit_mode() at that time.
*/
- current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
+ __set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags);
}
current->personality |= force_personality32;
@@ -743,6 +746,52 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
}
#endif
+#ifdef CONFIG_ADDRESS_MASKING
+
+#define LAM_U57_BITS 6
+
+static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return -ENODEV;
+
+ /* PTRACE_ARCH_PRCTL */
+ if (current->mm != mm)
+ return -EINVAL;
+
+ if (mm_valid_pasid(mm) &&
+ !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags))
+ return -EINTR;
+
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
+ mmap_write_unlock(mm);
+ return -EBUSY;
+ }
+
+ if (!nr_bits) {
+ mmap_write_unlock(mm);
+ return -EINVAL;
+ } else if (nr_bits <= LAM_U57_BITS) {
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+ } else {
+ mmap_write_unlock(mm);
+ return -EINVAL;
+ }
+
+ write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
+ set_tlbstate_lam_mode(mm);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+
+ mmap_write_unlock(mm);
+
+ return 0;
+}
+#endif
+
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
int ret = 0;
@@ -830,7 +879,27 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
case ARCH_MAP_VDSO_64:
return prctl_map_vdso(&vdso_image_64, arg2);
#endif
-
+#ifdef CONFIG_ADDRESS_MASKING
+ case ARCH_GET_UNTAG_MASK:
+ return put_user(task->mm->context.untag_mask,
+ (unsigned long __user *)arg2);
+ case ARCH_ENABLE_TAGGED_ADDR:
+ return prctl_enable_tagged_addr(task->mm, arg2);
+ case ARCH_FORCE_TAGGED_SVA:
+ set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags);
+ return 0;
+ case ARCH_GET_MAX_TAG_BITS:
+ if (!cpu_feature_enabled(X86_FEATURE_LAM))
+ return put_user(0, (unsigned long __user *)arg2);
+ else
+ return put_user(LAM_U57_BITS, (unsigned long __user *)arg2);
+#endif
+ case ARCH_SHSTK_ENABLE:
+ case ARCH_SHSTK_DISABLE:
+ case ARCH_SHSTK_LOCK:
+ case ARCH_SHSTK_UNLOCK:
+ case ARCH_SHSTK_STATUS:
+ return shstk_prctl(task, option, arg2);
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index dfaa270a7cc9..095f04bdabdc 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -58,6 +58,7 @@ enum x86_regset_64 {
REGSET64_FP,
REGSET64_IOPERM,
REGSET64_XSTATE,
+ REGSET64_SSP,
};
#define REGSET_GENERAL \
@@ -1267,6 +1268,17 @@ static struct user_regset x86_64_regsets[] __ro_after_init = {
.active = ioperm_active,
.regset_get = ioperm_get
},
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ [REGSET64_SSP] = {
+ .core_note_type = NT_X86_SHSTK,
+ .n = 1,
+ .size = sizeof(u64),
+ .align = sizeof(u64),
+ .active = ssp_active,
+ .regset_get = ssp_get,
+ .set = ssp_set
+ },
+#endif
};
static const struct user_regset_view user_x86_64_view = {
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4a73351f87f8..56cab1bb25f5 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -43,7 +43,7 @@
.code64
SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* %rdi indirection_page
@@ -113,7 +113,7 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
/* set return address to 0 if not preserving context */
pushq $0
/* store the start address on the stack */
@@ -231,7 +231,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
movq RSP(%r8), %rsp
movq CR4(%r8), %rax
@@ -256,8 +256,8 @@ SYM_CODE_END(virtual_mapped)
/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
- UNWIND_HINT_EMPTY
- movq %rdi, %rcx /* Put the page_list in %rcx */
+ UNWIND_HINT_END_OF_STACK
+ movq %rdi, %rcx /* Put the page_list in %rcx */
xorl %edi, %edi
xorl %esi, %esi
jmp 1f
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 3f664ab277c4..b031244d6d2d 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -22,6 +22,8 @@
#include <linux/efi.h>
#include <linux/platform_device.h>
#include <linux/io.h>
+#include <linux/psp-sev.h>
+#include <uapi/linux/sev-guest.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
@@ -2175,7 +2177,7 @@ static int __init init_sev_config(char *str)
}
__setup("sev=", init_sev_config);
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err)
+int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
{
struct ghcb_state state;
struct es_em_ctxt ctxt;
@@ -2183,8 +2185,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
struct ghcb *ghcb;
int ret;
- if (!fw_err)
- return -EINVAL;
+ rio->exitinfo2 = SEV_RET_NO_FW_CALL;
/*
* __sev_get_ghcb() needs to run with IRQs disabled because it is using
@@ -2209,16 +2210,16 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
if (ret)
goto e_put;
- *fw_err = ghcb->save.sw_exit_info_2;
- switch (*fw_err) {
+ rio->exitinfo2 = ghcb->save.sw_exit_info_2;
+ switch (rio->exitinfo2) {
case 0:
break;
- case SNP_GUEST_REQ_ERR_BUSY:
+ case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
ret = -EAGAIN;
break;
- case SNP_GUEST_REQ_INVALID_LEN:
+ case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
/* Number of expected pages are returned in RBX */
if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
input->data_npages = ghcb_get_rbx(ghcb);
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index 000000000000..1f767c509ee9
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,499 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu <yu-cheng.yu@intel.com>
+ */
+
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/sched/signal.h>
+#include <linux/compat.h>
+#include <linux/sizes.h>
+#include <linux/user.h>
+#include <linux/syscalls.h>
+#include <asm/msr.h>
+#include <asm/fpu/xstate.h>
+#include <asm/fpu/types.h>
+#include <asm/shstk.h>
+#include <asm/special_insns.h>
+#include <asm/fpu/api.h>
+#include <asm/prctl.h>
+
+#define SS_FRAME_SIZE 8
+
+static bool features_enabled(unsigned long features)
+{
+ return current->thread.features & features;
+}
+
+static void features_set(unsigned long features)
+{
+ current->thread.features |= features;
+}
+
+static void features_clr(unsigned long features)
+{
+ current->thread.features &= ~features;
+}
+
+/*
+ * Create a restore token on the shadow stack. A token is always 8-byte
+ * and aligned to 8.
+ */
+static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
+{
+ unsigned long addr;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(ssp, 8))
+ return -EINVAL;
+
+ addr = ssp - SS_FRAME_SIZE;
+
+ /*
+ * SSP is aligned, so reserved bits and mode bit are a zero, just mark
+ * the token 64-bit.
+ */
+ ssp |= BIT(0);
+
+ if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
+ return -EFAULT;
+
+ if (token_addr)
+ *token_addr = addr;
+
+ return 0;
+}
+
+static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
+ unsigned long token_offset, bool set_res_tok)
+{
+ int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
+ struct mm_struct *mm = current->mm;
+ unsigned long mapped_addr, unused;
+
+ if (addr)
+ flags |= MAP_FIXED_NOREPLACE;
+
+ mmap_write_lock(mm);
+ mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
+ VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
+ mmap_write_unlock(mm);
+
+ if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
+ goto out;
+
+ if (create_rstor_token(mapped_addr + token_offset, NULL)) {
+ vm_munmap(mapped_addr, size);
+ return -EINVAL;
+ }
+
+out:
+ return mapped_addr;
+}
+
+static unsigned long adjust_shstk_size(unsigned long size)
+{
+ if (size)
+ return PAGE_ALIGN(size);
+
+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
+}
+
+static void unmap_shadow_stack(u64 base, u64 size)
+{
+ while (1) {
+ int r;
+
+ r = vm_munmap(base, size);
+
+ /*
+ * vm_munmap() returns -EINTR when mmap_lock is held by
+ * something else, and that lock should not be held for a
+ * long time. Retry it for the case.
+ */
+ if (r == -EINTR) {
+ cond_resched();
+ continue;
+ }
+
+ /*
+ * For all other types of vm_munmap() failure, either the
+ * system is out of memory or there is bug.
+ */
+ WARN_ON_ONCE(r);
+ break;
+ }
+}
+
+static int shstk_setup(void)
+{
+ struct thread_shstk *shstk = &current->thread.shstk;
+ unsigned long addr, size;
+
+ /* Already enabled */
+ if (features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /* Also not supported for 32 bit and x32 */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ return -EOPNOTSUPP;
+
+ size = adjust_shstk_size(0);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return PTR_ERR((void *)addr);
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+ wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+ fpregs_unlock();
+
+ shstk->base = addr;
+ shstk->size = size;
+ features_set(ARCH_SHSTK_SHSTK);
+
+ return 0;
+}
+
+void reset_thread_features(void)
+{
+ memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
+ current->thread.features = 0;
+ current->thread.features_locked = 0;
+}
+
+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
+ unsigned long stack_size)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+ unsigned long addr, size;
+
+ /*
+ * If shadow stack is not enabled on the new thread, skip any
+ * switch to a new shadow stack.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ /*
+ * For CLONE_VM, except vfork, the child needs a separate shadow
+ * stack.
+ */
+ if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+ return 0;
+
+ size = adjust_shstk_size(stack_size);
+ addr = alloc_shstk(0, size, 0, false);
+ if (IS_ERR_VALUE(addr))
+ return addr;
+
+ shstk->base = addr;
+ shstk->size = size;
+
+ return addr + size;
+}
+
+static unsigned long get_user_shstk_addr(void)
+{
+ unsigned long long ssp;
+
+ fpregs_lock_and_load();
+
+ rdmsrl(MSR_IA32_PL3_SSP, ssp);
+
+ fpregs_unlock();
+
+ return ssp;
+}
+
+#define SHSTK_DATA_BIT BIT(63)
+
+static int put_shstk_data(u64 __user *addr, u64 data)
+{
+ if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ /*
+ * Mark the high bit so that the sigframe can't be processed as a
+ * return address.
+ */
+ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
+ return -EFAULT;
+ return 0;
+}
+
+static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
+{
+ unsigned long ldata;
+
+ if (unlikely(get_user(ldata, addr)))
+ return -EFAULT;
+
+ if (!(ldata & SHSTK_DATA_BIT))
+ return -EINVAL;
+
+ *data = ldata & ~SHSTK_DATA_BIT;
+
+ return 0;
+}
+
+static int shstk_push_sigframe(unsigned long *ssp)
+{
+ unsigned long target_ssp = *ssp;
+
+ /* Token must be aligned */
+ if (!IS_ALIGNED(target_ssp, 8))
+ return -EINVAL;
+
+ *ssp -= SS_FRAME_SIZE;
+ if (put_shstk_data((void *__user)*ssp, target_ssp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int shstk_pop_sigframe(unsigned long *ssp)
+{
+ unsigned long token_addr;
+ int err;
+
+ err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
+ if (unlikely(err))
+ return err;
+
+ /* Restore SSP aligned? */
+ if (unlikely(!IS_ALIGNED(token_addr, 8)))
+ return -EINVAL;
+
+ /* SSP in userspace? */
+ if (unlikely(token_addr >= TASK_SIZE_MAX))
+ return -EINVAL;
+
+ *ssp = token_addr;
+
+ return 0;
+}
+
+int setup_signal_shadow_stack(struct ksignal *ksig)
+{
+ void __user *restorer = ksig->ka.sa.sa_restorer;
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ if (!restorer)
+ return -EINVAL;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_push_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ /* Push restorer address */
+ ssp -= SS_FRAME_SIZE;
+ err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
+ if (unlikely(err))
+ return -EFAULT;
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+int restore_signal_shadow_stack(void)
+{
+ unsigned long ssp;
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ if (unlikely(!ssp))
+ return -EINVAL;
+
+ err = shstk_pop_sigframe(&ssp);
+ if (unlikely(err))
+ return err;
+
+ fpregs_lock_and_load();
+ wrmsrl(MSR_IA32_PL3_SSP, ssp);
+ fpregs_unlock();
+
+ return 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+ struct thread_shstk *shstk = &tsk->thread.shstk;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !features_enabled(ARCH_SHSTK_SHSTK))
+ return;
+
+ /*
+ * When fork() with CLONE_VM fails, the child (tsk) already has a
+ * shadow stack allocated, and exit_thread() calls this function to
+ * free it. In this case the parent (current) and the child share
+ * the same mm struct.
+ */
+ if (!tsk->mm || tsk->mm != current->mm)
+ return;
+
+ unmap_shadow_stack(shstk->base, shstk->size);
+}
+
+static int wrss_control(bool enable)
+{
+ u64 msrval;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /*
+ * Only enable WRSS if shadow stack is enabled. If shadow stack is not
+ * enabled, WRSS will already be disabled, so don't bother clearing it
+ * when disabling.
+ */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return -EPERM;
+
+ /* Already enabled/disabled? */
+ if (features_enabled(ARCH_SHSTK_WRSS) == enable)
+ return 0;
+
+ fpregs_lock_and_load();
+ rdmsrl(MSR_IA32_U_CET, msrval);
+
+ if (enable) {
+ features_set(ARCH_SHSTK_WRSS);
+ msrval |= CET_WRSS_EN;
+ } else {
+ features_clr(ARCH_SHSTK_WRSS);
+ if (!(msrval & CET_WRSS_EN))
+ goto unlock;
+
+ msrval &= ~CET_WRSS_EN;
+ }
+
+ wrmsrl(MSR_IA32_U_CET, msrval);
+
+unlock:
+ fpregs_unlock();
+
+ return 0;
+}
+
+static int shstk_disable(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ /* Already disabled? */
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ fpregs_lock_and_load();
+ /* Disable WRSS too when disabling shadow stack */
+ wrmsrl(MSR_IA32_U_CET, 0);
+ wrmsrl(MSR_IA32_PL3_SSP, 0);
+ fpregs_unlock();
+
+ shstk_free(current);
+ features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
+
+ return 0;
+}
+
+SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
+{
+ bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
+ unsigned long aligned_size;
+
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ return -EOPNOTSUPP;
+
+ if (flags & ~SHADOW_STACK_SET_TOKEN)
+ return -EINVAL;
+
+ /* If there isn't space for a token */
+ if (set_tok && size < 8)
+ return -ENOSPC;
+
+ if (addr && addr < SZ_4G)
+ return -ERANGE;
+
+ /*
+ * An overflow would result in attempting to write the restore token
+ * to the wrong location. Not catastrophic, but just return the right
+ * error code and block it.
+ */
+ aligned_size = PAGE_ALIGN(size);
+ if (aligned_size < size)
+ return -EOVERFLOW;
+
+ return alloc_shstk(addr, aligned_size, size, set_tok);
+}
+
+long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
+{
+ unsigned long features = arg2;
+
+ if (option == ARCH_SHSTK_STATUS) {
+ return put_user(task->thread.features, (unsigned long __user *)arg2);
+ }
+
+ if (option == ARCH_SHSTK_LOCK) {
+ task->thread.features_locked |= features;
+ return 0;
+ }
+
+ /* Only allow via ptrace */
+ if (task != current) {
+ if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
+ task->thread.features_locked &= ~features;
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ /* Do not allow to change locked features */
+ if (features & task->thread.features_locked)
+ return -EPERM;
+
+ /* Only support enabling/disabling one feature at a time. */
+ if (hweight_long(features) > 1)
+ return -EINVAL;
+
+ if (option == ARCH_SHSTK_DISABLE) {
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(false);
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_disable();
+ return -EINVAL;
+ }
+
+ /* Handle ARCH_SHSTK_ENABLE */
+ if (features & ARCH_SHSTK_SHSTK)
+ return shstk_setup();
+ if (features & ARCH_SHSTK_WRSS)
+ return wrss_control(true);
+ return -EINVAL;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 004cb30b7419..356253e85ce9 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -40,6 +40,7 @@
#include <asm/syscall.h>
#include <asm/sigframe.h>
#include <asm/signal.h>
+#include <asm/shstk.h>
static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 9027fc088f97..c12624bc82a3 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -402,7 +402,7 @@ Efault:
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 13a1e6083837..cacf2ede6217 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -175,6 +175,9 @@ int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
frame = get_sigframe(ksig, regs, sizeof(struct rt_sigframe), &fp);
uc_flags = frame_uc_flags(regs);
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -260,6 +263,9 @@ SYSCALL_DEFINE0(rt_sigreturn)
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
@@ -403,7 +409,7 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
*/
static_assert(NSIGILL == 11);
static_assert(NSIGFPE == 15);
-static_assert(NSIGSEGV == 9);
+static_assert(NSIGSEGV == 10);
static_assert(NSIGBUS == 5);
static_assert(NSIGTRAP == 6);
static_assert(NSIGCHLD == 6);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9013bb28255a..851477f7d728 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -121,17 +121,20 @@ int arch_update_cpu_topology(void)
return retval;
}
+
+static unsigned int smpboot_warm_reset_vector_count;
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
spin_lock_irqsave(&rtc_lock, flags);
- CMOS_WRITE(0xa, 0xf);
+ if (!smpboot_warm_reset_vector_count++) {
+ CMOS_WRITE(0xa, 0xf);
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf;
+ }
spin_unlock_irqrestore(&rtc_lock, flags);
- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
- start_eip >> 4;
- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
- start_eip & 0xf;
}
static inline void smpboot_restore_warm_reset_vector(void)
@@ -143,10 +146,12 @@ static inline void smpboot_restore_warm_reset_vector(void)
* to default values.
*/
spin_lock_irqsave(&rtc_lock, flags);
- CMOS_WRITE(0, 0xf);
+ if (!--smpboot_warm_reset_vector_count) {
+ CMOS_WRITE(0, 0xf);
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+ }
spin_unlock_irqrestore(&rtc_lock, flags);
- *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
/*
@@ -1059,8 +1064,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
-#else
- initial_gs = per_cpu_offset(cpu);
#endif
return 0;
}
@@ -1086,9 +1089,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
start_ip = real_mode_header->trampoline_start64;
#endif
idle->thread.sp = (unsigned long)task_pt_regs(idle);
- early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
- initial_stack = idle->thread.sp;
+
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+ initial_stack = idle->thread.sp;
+ } else {
+ smpboot_control = cpu;
+ }
/* Enable the espfix hack for this CPU */
init_espfix_ap(cpu);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8cc653ffdccd..c783aeb37dce 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -193,7 +193,11 @@ get_unmapped_area:
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = PAGE_SIZE;
+ if (!in_32bit_syscall() && (flags & MAP_ABOVE4G))
+ info.low_limit = SZ_4G;
+ else
+ info.low_limit = PAGE_SIZE;
+
info.high_limit = get_mmap_base(0);
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 492a60febb11..f358350624b2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -77,18 +77,6 @@
DECLARE_BITMAP(system_vectors, NR_VECTORS);
-static inline void cond_local_irq_enable(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_enable();
-}
-
-static inline void cond_local_irq_disable(struct pt_regs *regs)
-{
- if (regs->flags & X86_EFLAGS_IF)
- local_irq_disable();
-}
-
__always_inline int is_valid_bugaddr(unsigned long addr)
{
if (addr < TASK_SIZE_MAX)
@@ -213,81 +201,6 @@ DEFINE_IDTENTRY(exc_overflow)
do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
-#ifdef CONFIG_X86_KERNEL_IBT
-
-static __ro_after_init bool ibt_fatal = true;
-
-extern void ibt_selftest_ip(void); /* code label defined in asm below */
-
-enum cp_error_code {
- CP_EC = (1 << 15) - 1,
-
- CP_RET = 1,
- CP_IRET = 2,
- CP_ENDBR = 3,
- CP_RSTRORSSP = 4,
- CP_SETSSBSY = 5,
-
- CP_ENCL = 1 << 15,
-};
-
-DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
-{
- if (!cpu_feature_enabled(X86_FEATURE_IBT)) {
- pr_err("Unexpected #CP\n");
- BUG();
- }
-
- if (WARN_ON_ONCE(user_mode(regs) || (error_code & CP_EC) != CP_ENDBR))
- return;
-
- if (unlikely(regs->ip == (unsigned long)&ibt_selftest_ip)) {
- regs->ax = 0;
- return;
- }
-
- pr_err("Missing ENDBR: %pS\n", (void *)instruction_pointer(regs));
- if (!ibt_fatal) {
- printk(KERN_DEFAULT CUT_HERE);
- __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
- return;
- }
- BUG();
-}
-
-/* Must be noinline to ensure uniqueness of ibt_selftest_ip. */
-noinline bool ibt_selftest(void)
-{
- unsigned long ret;
-
- asm (" lea ibt_selftest_ip(%%rip), %%rax\n\t"
- ANNOTATE_RETPOLINE_SAFE
- " jmp *%%rax\n\t"
- "ibt_selftest_ip:\n\t"
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
- " nop\n\t"
-
- : "=a" (ret) : : "memory");
-
- return !ret;
-}
-
-static int __init ibt_setup(char *str)
-{
- if (!strcmp(str, "off"))
- setup_clear_cpu_cap(X86_FEATURE_IBT);
-
- if (!strcmp(str, "warn"))
- ibt_fatal = false;
-
- return 1;
-}
-
-__setup("ibt=", ibt_setup);
-
-#endif /* CONFIG_X86_KERNEL_IBT */
-
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
@@ -671,15 +584,15 @@ static bool try_fixup_enqcmd_gp(void)
if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
return false;
- pasid = current->mm->pasid;
-
/*
* If the mm has not been allocated a
* PASID, the #GP can not be fixed up.
*/
- if (!pasid_valid(pasid))
+ if (!mm_valid_pasid(current->mm))
return false;
+ pasid = current->mm->pasid;
+
/*
* Did this thread already have its PASID activated?
* If so, the #GP must be from something else.
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 37307b40f8da..3ac50b7298d1 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -133,7 +133,7 @@ static struct orc_entry null_orc_entry = {
.sp_offset = sizeof(long),
.sp_reg = ORC_REG_SP,
.bp_reg = ORC_REG_UNDEFINED,
- .type = UNWIND_HINT_TYPE_CALL
+ .type = ORC_TYPE_CALL
};
#ifdef CONFIG_CALL_THUNKS
@@ -153,12 +153,11 @@ static struct orc_entry *orc_callthunk_find(unsigned long ip)
/* Fake frame pointer entry -- used as a fallback for generated code */
static struct orc_entry orc_fp_entry = {
- .type = UNWIND_HINT_TYPE_CALL,
+ .type = ORC_TYPE_CALL,
.sp_reg = ORC_REG_BP,
.sp_offset = 16,
.bp_reg = ORC_REG_PREV_SP,
.bp_offset = -16,
- .end = 0,
};
static struct orc_entry *orc_find(unsigned long ip)
@@ -250,13 +249,13 @@ static int orc_sort_cmp(const void *_a, const void *_b)
return -1;
/*
- * The "weak" section terminator entries need to always be on the left
+ * The "weak" section terminator entries need to always be first
* to ensure the lookup code skips them in favor of real entries.
* These terminator entries exist to handle any gaps created by
* whitelisted .o files which didn't get objtool generation.
*/
orc_a = cur_orc_table + (a - cur_orc_ip_table);
- return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
+ return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
}
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
@@ -474,14 +473,12 @@ bool unwind_next_frame(struct unwind_state *state)
*/
orc = &orc_fp_entry;
state->error = true;
- }
-
- /* End-of-stack check for kernel threads: */
- if (orc->sp_reg == ORC_REG_UNDEFINED) {
- if (!orc->end)
+ } else {
+ if (orc->type == ORC_TYPE_UNDEFINED)
goto err;
- goto the_end;
+ if (orc->type == ORC_TYPE_END_OF_STACK)
+ goto the_end;
}
state->signal = orc->signal;
@@ -554,7 +551,7 @@ bool unwind_next_frame(struct unwind_state *state)
/* Find IP, SP and possibly regs: */
switch (orc->type) {
- case UNWIND_HINT_TYPE_CALL:
+ case ORC_TYPE_CALL:
ip_p = sp - sizeof(long);
if (!deref_stack_reg(state, ip_p, &state->ip))
@@ -567,7 +564,7 @@ bool unwind_next_frame(struct unwind_state *state)
state->prev_regs = NULL;
break;
- case UNWIND_HINT_TYPE_REGS:
+ case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn_current("can't access registers at %pB\n",
(void *)orig_ip);
@@ -590,13 +587,13 @@ bool unwind_next_frame(struct unwind_state *state)
state->full_regs = true;
break;
- case UNWIND_HINT_TYPE_REGS_PARTIAL:
+ case ORC_TYPE_REGS_PARTIAL:
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn_current("can't access iret registers at %pB\n",
(void *)orig_ip);
goto err;
}
- /* See UNWIND_HINT_TYPE_REGS case comment. */
+ /* See ORC_TYPE_REGS case comment. */
state->ip = unwind_recover_rethook(state, state->ip,
(unsigned long *)(state->sp - sizeof(long)));
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ef80d361b463..95be3831df73 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -134,6 +134,7 @@ static void enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool
static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return false; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu_early,
@@ -149,6 +150,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.realmode_reserve = reserve_real_mode,
.realmode_init = init_real_mode,
.hyper.pin_vcpu = x86_op_int_noop,
+ .hyper.is_private_mmio = is_private_mmio_noop,
.guest = {
.enc_status_change_prepare = enc_status_change_prepare_noop,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f25bc3cbb250..a1b08359769b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
+#include <linux/smp.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -41,6 +42,9 @@
#include <asm/fpu/api.h>
#include <asm/virtext.h>
+
+#include <trace/events/ipi.h>
+
#include "trace.h"
#include "svm.h"
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d852ce84920..2b1d82647195 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,7 +60,9 @@
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
#include <linux/suspend.h>
+#include <linux/smp.h>
+#include <trace/events/ipi.h>
#include <trace/events/kvm.h>
#include <asm/debugreg.h>
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index b70d98d79a9d..b64a2bd1a1ef 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -37,22 +37,22 @@
#define ASM_BARRIER_NOSPEC ALTERNATIVE "", "lfence", X86_FEATURE_LFENCE_RDTSC
-#ifdef CONFIG_X86_5LEVEL
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rdx), \
- __stringify(mov $((1 << 56) - 4096 - (n)),%rdx), X86_FEATURE_LA57
-#else
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- mov $(TASK_SIZE_MAX - (n)),%_ASM_DX
-#endif
+.macro check_range size:req
+.if IS_ENABLED(CONFIG_X86_64)
+ mov %rax, %rdx
+ sar $63, %rdx
+ or %rdx, %rax
+.else
+ cmp $TASK_SIZE_MAX-\size+1, %eax
+ jae .Lbad_get_user
+ sbb %edx, %edx /* array_index_mask_nospec() */
+ and %edx, %eax
+.endif
+.endm
.text
SYM_FUNC_START(__get_user_1)
- LOAD_TASK_SIZE_MINUS_N(0)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=1
ASM_STAC
1: movzbl (%_ASM_AX),%edx
xor %eax,%eax
@@ -62,11 +62,7 @@ SYM_FUNC_END(__get_user_1)
EXPORT_SYMBOL(__get_user_1)
SYM_FUNC_START(__get_user_2)
- LOAD_TASK_SIZE_MINUS_N(1)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=2
ASM_STAC
2: movzwl (%_ASM_AX),%edx
xor %eax,%eax
@@ -76,11 +72,7 @@ SYM_FUNC_END(__get_user_2)
EXPORT_SYMBOL(__get_user_2)
SYM_FUNC_START(__get_user_4)
- LOAD_TASK_SIZE_MINUS_N(3)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=4
ASM_STAC
3: movl (%_ASM_AX),%edx
xor %eax,%eax
@@ -90,30 +82,17 @@ SYM_FUNC_END(__get_user_4)
EXPORT_SYMBOL(__get_user_4)
SYM_FUNC_START(__get_user_8)
-#ifdef CONFIG_X86_64
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
+ check_range size=8
ASM_STAC
+#ifdef CONFIG_X86_64
4: movq (%_ASM_AX),%rdx
- xor %eax,%eax
- ASM_CLAC
- RET
#else
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_DX,%_ASM_AX
- jae bad_get_user_8
- sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
- and %_ASM_DX, %_ASM_AX
- ASM_STAC
4: movl (%_ASM_AX),%edx
5: movl 4(%_ASM_AX),%ecx
+#endif
xor %eax,%eax
ASM_CLAC
RET
-#endif
SYM_FUNC_END(__get_user_8)
EXPORT_SYMBOL(__get_user_8)
@@ -166,7 +145,7 @@ EXPORT_SYMBOL(__get_user_nocheck_8)
SYM_CODE_START_LOCAL(.Lbad_get_user_clac)
ASM_CLAC
-bad_get_user:
+.Lbad_get_user:
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
RET
@@ -184,23 +163,23 @@ SYM_CODE_END(.Lbad_get_user_8_clac)
#endif
/* get_user */
- _ASM_EXTABLE_UA(1b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(2b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(3b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(1b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(2b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(3b, .Lbad_get_user_clac)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(4b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(4b, .Lbad_get_user_clac)
#else
- _ASM_EXTABLE_UA(4b, .Lbad_get_user_8_clac)
- _ASM_EXTABLE_UA(5b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(4b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(5b, .Lbad_get_user_8_clac)
#endif
/* __get_user */
- _ASM_EXTABLE_UA(6b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(7b, .Lbad_get_user_clac)
- _ASM_EXTABLE_UA(8b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(6b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(7b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(8b, .Lbad_get_user_clac)
#ifdef CONFIG_X86_64
- _ASM_EXTABLE_UA(9b, .Lbad_get_user_clac)
+ _ASM_EXTABLE(9b, .Lbad_get_user_clac)
#else
- _ASM_EXTABLE_UA(9b, .Lbad_get_user_8_clac)
- _ASM_EXTABLE_UA(10b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(9b, .Lbad_get_user_8_clac)
+ _ASM_EXTABLE(10b, .Lbad_get_user_8_clac)
#endif
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index 32125224fcca..3062d09a776d 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -33,20 +33,20 @@
* as they get called from within inline assembly.
*/
-#ifdef CONFIG_X86_5LEVEL
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- ALTERNATIVE __stringify(mov $((1 << 47) - 4096 - (n)),%rbx), \
- __stringify(mov $((1 << 56) - 4096 - (n)),%rbx), X86_FEATURE_LA57
-#else
-#define LOAD_TASK_SIZE_MINUS_N(n) \
- mov $(TASK_SIZE_MAX - (n)),%_ASM_BX
-#endif
+.macro check_range size:req
+.if IS_ENABLED(CONFIG_X86_64)
+ mov %rcx, %rbx
+ sar $63, %rbx
+ or %rbx, %rcx
+.else
+ cmp $TASK_SIZE_MAX-\size+1, %ecx
+ jae .Lbad_put_user
+.endif
+.endm
.text
SYM_FUNC_START(__put_user_1)
- LOAD_TASK_SIZE_MINUS_N(0)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ check_range size=1
ASM_STAC
1: movb %al,(%_ASM_CX)
xor %ecx,%ecx
@@ -66,9 +66,7 @@ SYM_FUNC_END(__put_user_nocheck_1)
EXPORT_SYMBOL(__put_user_nocheck_1)
SYM_FUNC_START(__put_user_2)
- LOAD_TASK_SIZE_MINUS_N(1)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ check_range size=2
ASM_STAC
3: movw %ax,(%_ASM_CX)
xor %ecx,%ecx
@@ -88,9 +86,7 @@ SYM_FUNC_END(__put_user_nocheck_2)
EXPORT_SYMBOL(__put_user_nocheck_2)
SYM_FUNC_START(__put_user_4)
- LOAD_TASK_SIZE_MINUS_N(3)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ check_range size=4
ASM_STAC
5: movl %eax,(%_ASM_CX)
xor %ecx,%ecx
@@ -110,9 +106,7 @@ SYM_FUNC_END(__put_user_nocheck_4)
EXPORT_SYMBOL(__put_user_nocheck_4)
SYM_FUNC_START(__put_user_8)
- LOAD_TASK_SIZE_MINUS_N(7)
- cmp %_ASM_BX,%_ASM_CX
- jae .Lbad_put_user
+ check_range size=8
ASM_STAC
7: mov %_ASM_AX,(%_ASM_CX)
#ifdef CONFIG_X86_32
@@ -144,15 +138,15 @@ SYM_CODE_START_LOCAL(.Lbad_put_user_clac)
RET
SYM_CODE_END(.Lbad_put_user_clac)
- _ASM_EXTABLE_UA(1b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(2b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(3b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(4b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(5b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(6b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(7b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(9b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(1b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(2b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(3b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(4b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(5b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(6b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(7b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(9b, .Lbad_put_user_clac)
#ifdef CONFIG_X86_32
- _ASM_EXTABLE_UA(8b, .Lbad_put_user_clac)
- _ASM_EXTABLE_UA(10b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(8b, .Lbad_put_user_clac)
+ _ASM_EXTABLE(10b, .Lbad_put_user_clac)
#endif
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index 5f61c65322be..27ef53fab6bd 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -33,7 +33,7 @@
.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_thunk_\reg, SYM_L_GLOBAL)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
ALTERNATIVE_2 __stringify(RETPOLINE \reg), \
@@ -75,7 +75,7 @@ SYM_CODE_END(__x86_indirect_thunk_array)
.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_call_thunk_\reg, SYM_L_GLOBAL)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
CALL_DEPTH_ACCOUNT
@@ -103,7 +103,7 @@ SYM_CODE_END(__x86_indirect_call_thunk_array)
.align RETPOLINE_THUNK_SIZE
SYM_INNER_LABEL(__x86_indirect_jump_thunk_\reg, SYM_L_GLOBAL)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
POLINE \reg
ANNOTATE_UNRET_SAFE
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 6c1f8ac5e721..f515542f017f 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -136,13 +136,4 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
}
}
EXPORT_SYMBOL_GPL(__memcpy_flushcache);
-
-void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
- size_t len)
-{
- char *from = kmap_atomic(page);
-
- memcpy_flushcache(to, from + offset, len);
- kunmap_atomic(from);
-}
#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e4399983c50c..fe68119ce2cc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1118,8 +1118,22 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
(error_code & X86_PF_INSTR), foreign))
return 1;
+ /*
+ * Shadow stack accesses (PF_SHSTK=1) are only permitted to
+ * shadow stack VMAs. All other accesses result in an error.
+ */
+ if (error_code & X86_PF_SHSTK) {
+ if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK)))
+ return 1;
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ return 1;
+ return 0;
+ }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
+ if (unlikely(vma->vm_flags & VM_SHADOW_STACK))
+ return 1;
if (unlikely(!(vma->vm_flags & VM_WRITE)))
return 1;
return 0;
@@ -1311,6 +1325,14 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+ /*
+ * Read-only permissions can not be expressed in shadow stack PTEs.
+ * Treat all shadow stack accesses as WRITE faults. This ensures
+ * that the MM will prepare everything (e.g., break COW) such that
+ * maybe_mkwrite() can create a proper shadow stack PTE.
+ */
+ if (error_code & X86_PF_SHSTK)
+ flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cb258f58fdc8..3cdac0f0055d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -806,7 +806,7 @@ void __init poking_init(void)
BUG_ON(!poking_mm);
/* Xen PV guests need the PGD to be pinned. */
- paravirt_arch_dup_mmap(NULL, poking_mm);
+ paravirt_enter_mmap(poking_mm);
/*
* Randomize the poking address, but make sure that the following page
@@ -1048,6 +1048,11 @@ __visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
+#ifdef CONFIG_ADDRESS_MASKING
+DEFINE_PER_CPU(u64, tlbstate_untag_mask);
+EXPORT_PER_CPU_SYMBOL(tlbstate_untag_mask);
+#endif
+
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 6453fbaedb08..aa7d279321ea 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -116,6 +116,11 @@ static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *des
if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
+ if (x86_platform.hyper.is_private_mmio(addr)) {
+ desc->flags |= IORES_MAP_ENCRYPTED;
+ return;
+ }
+
if (!IS_ENABLED(CONFIG_EFI))
return;
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 9c4d8dbcb129..e0b51c09109f 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -513,10 +513,14 @@ void __init mem_encrypt_free_decrypted_mem(void)
npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
/*
- * The unused memory range was mapped decrypted, change the encryption
- * attribute from decrypted to encrypted before freeing it.
+ * If the unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it. Base the
+ * re-encryption on the same condition used for the decryption in
+ * sme_postprocess_startup(). Higher level abstractions, such as
+ * CC_ATTR_MEM_ENCRYPT, aren't necessarily equivalent in a Hyper-V VM
+ * using vTOM, where sme_me_mask is always zero.
*/
- if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+ if (sme_me_mask) {
r = set_memory_encrypted(vaddr, npages);
if (r) {
pr_warn("failed to free unused decrypted pages\n");
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index f0099ee70e02..fc627acfe40e 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2073,12 +2073,12 @@ int set_memory_nx(unsigned long addr, int numpages)
int set_memory_ro(unsigned long addr, int numpages)
{
- return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
}
int set_memory_rox(unsigned long addr, int numpages)
{
- pgprot_t clr = __pgprot(_PAGE_RW);
+ pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY);
if (__supported_pte_mask & _PAGE_NX)
clr.pgprot |= _PAGE_NX;
@@ -2175,9 +2175,6 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
- if (hv_is_isolation_supported())
- return hv_set_mem_host_visibility(addr, numpages, !enc);
-
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return __set_memory_enc_pgtable(addr, numpages, enc);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e4f499eb0f29..afab0bc7862b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -880,3 +880,41 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return pte_mkwrite_shstk(pte);
+
+ pte = pte_mkwrite_kernel(pte);
+
+ if (pte_dirty(pte))
+ pte = pte_clear_saveddirty(pte);
+
+ return pte;
+}
+
+pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return pmd_mkwrite_shstk(pmd);
+
+ pmd = pmd_set_flags(pmd, _PAGE_RW);
+
+ if (pmd_dirty(pmd))
+ pmd = pmd_clear_saveddirty(pmd);
+
+ return pmd;
+}
+
+void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
+{
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
+ pte_shstk(pte));
+}
+
+void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
+{
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
+ pmd_shstk(pmd));
+}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 16c5292d227d..267acf27480a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -154,26 +154,30 @@ static inline u16 user_pcid(u16 asid)
return ret;
}
-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{
+ unsigned long cr3 = __sme_pa(pgd) | lam;
+
if (static_cpu_has(X86_FEATURE_PCID)) {
- return __sme_pa(pgd) | kern_pcid(asid);
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(pgd);
}
+
+ return cr3;
}
-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+ unsigned long lam)
{
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
- return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+ return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
}
/*
@@ -274,15 +278,16 @@ static inline void invalidate_user_asid(u16 asid)
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
-static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+ bool need_flush)
{
unsigned long new_mm_cr3;
if (need_flush) {
invalidate_user_asid(new_asid);
- new_mm_cr3 = build_cr3(pgdir, new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
} else {
- new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
}
/*
@@ -491,6 +496,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
@@ -520,7 +526,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid,
+ tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -552,10 +559,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* instruction.
*/
if (real_prev == next) {
+ /* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
+ * If this races with another thread that enables lam, 'new_lam'
+ * might not match tlbstate_lam_cr3_mask().
+ */
+
+ /*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
@@ -622,15 +635,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}
+ set_tlbstate_lam_mode(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -691,6 +705,10 @@ void initialize_tlbstate_and_flush(void)
/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+ /* LAM expected to be disabled */
+ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+ WARN_ON(mm_lam_cr3_mask(mm));
+
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
* doesn't work like other CR4 bits because it can only be set from
@@ -699,8 +717,8 @@ void initialize_tlbstate_and_flush(void)
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));
- /* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm->pgd, 0));
+ /* Disable LAM, force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm->pgd, 0, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
@@ -708,6 +726,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+ set_tlbstate_lam_mode(mm);
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -1071,8 +1090,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
*/
unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
- this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ unsigned long cr3 =
+ build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+ tlbstate_lam_cr3_mask());
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index 7fe564eaf228..c4365a05ab83 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -50,7 +50,7 @@
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
SYM_CODE_START_LOCAL(pvh_start_xen)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
cld
lgdt (_pa(gdt))
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 093b78c8bbec..5a034a994682 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -640,7 +640,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_coprocessor_error, false ),
TRAP_ENTRY(exc_alignment_check, false ),
TRAP_ENTRY(exc_simd_coprocessor_error, false ),
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
TRAP_ENTRY(exc_control_protection, false ),
#endif
};
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ee29fb558f2e..fdc91deece7e 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -150,7 +150,7 @@ void make_lowmem_page_readwrite(void *vaddr)
if (pte == NULL)
return; /* vaddr missing */
- ptev = pte_mkwrite(*pte);
+ ptev = pte_mkwrite_kernel(*pte);
if (HYPERVISOR_update_va_mapping(address, ptev, 0))
BUG();
@@ -885,14 +885,7 @@ void xen_mm_unpin_all(void)
spin_unlock(&pgd_lock);
}
-static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- spin_lock(&next->page_table_lock);
- xen_pgd_pin(next);
- spin_unlock(&next->page_table_lock);
-}
-
-static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_enter_mmap(struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -2153,8 +2146,7 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
+ .enter_mmap = xen_enter_mmap,
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 4a184f6e4e4d..9e5e68008785 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -148,7 +148,7 @@ xen_pv_trap asm_exc_page_fault
xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
xen_pv_trap asm_exc_control_protection
#endif
#ifdef CONFIG_X86_MCE
@@ -165,7 +165,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ENDBR
pop %rcx
pop %r11
@@ -193,7 +193,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
* rsp->rax }
*/
SYM_CODE_START(xen_iret)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
pushq $0
jmp hypercall_iret
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index e36ea4268bd2..643d02900fbb 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,11 +45,11 @@ SYM_CODE_END(hypercall_page)
#ifdef CONFIG_XEN_PV
__INIT
SYM_CODE_START(startup_xen)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
cld
- mov initial_stack(%rip), %rsp
+ leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
/* Set up %gs.
*
@@ -71,7 +71,7 @@ SYM_CODE_END(startup_xen)
#ifdef CONFIG_XEN_PV_SMP
.pushsection .text
SYM_CODE_START(asm_cpu_bringup_and_idle)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ENDBR
call cpu_bringup_and_idle
diff --git a/arch/xtensa/include/asm/cmpxchg.h b/arch/xtensa/include/asm/cmpxchg.h
index eb87810357ad..675a11ea8de7 100644
--- a/arch/xtensa/include/asm/cmpxchg.h
+++ b/arch/xtensa/include/asm/cmpxchg.h
@@ -170,7 +170,7 @@ static inline unsigned long xchg_u32(volatile int * m, unsigned long val)
}
#define arch_xchg(ptr,x) \
- ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
+ ((__typeof__(*(ptr)))__arch_xchg((unsigned long)(x),(ptr),sizeof(*(ptr))))
static inline u32 xchg_small(volatile void *ptr, u32 x, int size)
{
@@ -203,7 +203,7 @@ static inline u32 xchg_small(volatile void *ptr, u32 x, int size)
extern void __xchg_called_with_bad_pointer(void);
static __inline__ unsigned long
-__xchg(unsigned long x, volatile void * ptr, int size)
+__arch_xchg(unsigned long x, volatile void * ptr, int size)
{
switch (size) {
case 1:
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index fc7a14884c6c..d72632d9c53c 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -262,7 +262,7 @@ static inline pte_t pte_mkdirty(pte_t pte)
{ pte_val(pte) |= _PAGE_DIRTY; return pte; }
static inline pte_t pte_mkyoung(pte_t pte)
{ pte_val(pte) |= _PAGE_ACCESSED; return pte; }
-static inline pte_t pte_mkwrite(pte_t pte)
+static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{ pte_val(pte) |= _PAGE_WRITABLE; return pte; }
#define pgprot_noncached(prot) \
diff --git a/arch/xtensa/include/asm/smp.h b/arch/xtensa/include/asm/smp.h
index 4e43f5643891..5dc5bf8cdd77 100644
--- a/arch/xtensa/include/asm/smp.h
+++ b/arch/xtensa/include/asm/smp.h
@@ -33,7 +33,7 @@ void show_ipi_list(struct seq_file *p, int prec);
void __cpu_die(unsigned int cpu);
int __cpu_disable(void);
-void cpu_die(void);
+void __noreturn cpu_die(void);
void cpu_restart(void);
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 4dc109dd6214..07dd6baf18cf 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -322,7 +322,7 @@ void __cpu_die(unsigned int cpu)
pr_err("CPU%u: unable to kill\n", cpu);
}
-void arch_cpu_idle_dead(void)
+void __noreturn arch_cpu_idle_dead(void)
{
cpu_die();
}
@@ -341,6 +341,8 @@ void __ref cpu_die(void)
__asm__ __volatile__(
" movi a2, cpu_restart\n"
" jx a2\n");
+
+ BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -389,7 +391,7 @@ void arch_send_call_function_single_ipi(int cpu)
send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC);
}
-void smp_send_reschedule(int cpu)
+void arch_smp_send_reschedule(int cpu)
{
send_ipi_message(cpumask_of(cpu), IPI_RESCHEDULE);
}
diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c
index 0c15f01fa386..f97166fba9d9 100644
--- a/drivers/crypto/ccp/sev-dev.c
+++ b/drivers/crypto/ccp/sev-dev.c
@@ -442,12 +442,19 @@ static int __sev_init_ex_locked(int *error)
return __sev_do_cmd_locked(SEV_CMD_INIT_EX, &data, error);
}
+static inline int __sev_do_init_locked(int *psp_ret)
+{
+ if (sev_init_ex_buffer)
+ return __sev_init_ex_locked(psp_ret);
+ else
+ return __sev_init_locked(psp_ret);
+}
+
static int __sev_platform_init_locked(int *error)
{
+ int rc = 0, psp_ret = SEV_RET_NO_FW_CALL;
struct psp_device *psp = psp_master;
struct sev_device *sev;
- int rc = 0, psp_ret = -1;
- int (*init_function)(int *error);
if (!psp || !psp->sev_data)
return -ENODEV;
@@ -458,15 +465,12 @@ static int __sev_platform_init_locked(int *error)
return 0;
if (sev_init_ex_buffer) {
- init_function = __sev_init_ex_locked;
rc = sev_read_init_ex_file();
if (rc)
return rc;
- } else {
- init_function = __sev_init_locked;
}
- rc = init_function(&psp_ret);
+ rc = __sev_do_init_locked(&psp_ret);
if (rc && psp_ret == SEV_RET_SECURE_DATA_INVALID) {
/*
* Initialization command returned an integrity check failure
@@ -475,9 +479,11 @@ static int __sev_platform_init_locked(int *error)
* initialization function should succeed by replacing the state
* with a reset state.
*/
- dev_err(sev->dev, "SEV: retrying INIT command because of SECURE_DATA_INVALID error. Retrying once to reset PSP SEV state.");
- rc = init_function(&psp_ret);
+ dev_err(sev->dev,
+"SEV: retrying INIT command because of SECURE_DATA_INVALID error. Retrying once to reset PSP SEV state.");
+ rc = __sev_do_init_locked(&psp_ret);
}
+
if (error)
*error = psp_ret;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 5c6c9a6d469c..75bad59439b8 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1043,7 +1043,7 @@ static void cleanup_status_page(struct intel_engine_cs *engine)
/* Prevent writes into HWSP after returning the page to the system */
intel_engine_set_hwsp_writemask(engine, ~0u);
- vma = fetch_and_zero(&engine->status_page.vma);
+ vma = __xchg(&engine->status_page.vma, 0);
if (!vma)
return;
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 9a527e1f5be6..09befcc6a84f 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -229,7 +229,7 @@ unlock:
mutex_unlock(&ce->timeline->mutex);
out:
if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
- i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
+ i915_request_put(__xchg(&engine->heartbeat.systole, 0));
intel_engine_pm_put(engine);
}
@@ -244,7 +244,7 @@ void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
if (cancel_delayed_work(&engine->heartbeat.work))
- i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
+ i915_request_put(__xchg(&engine->heartbeat.systole, 0));
}
void intel_gt_unpark_heartbeats(struct intel_gt *gt)
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 750326434677..ddf35410e2cc 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -3207,7 +3207,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
RB_CLEAR_NODE(rb);
spin_lock(&ve->base.sched_engine->lock);
- rq = fetch_and_zero(&ve->request);
+ rq = __xchg(&ve->request, NULL);
if (rq) {
if (i915_request_mark_eio(rq)) {
rq->engine = engine;
@@ -3612,7 +3612,7 @@ static void rcu_virtual_context_destroy(struct work_struct *wrk)
spin_lock_irq(&ve->base.sched_engine->lock);
- old = fetch_and_zero(&ve->request);
+ old = __xchg(&ve->request, NULL);
if (old) {
GEM_BUG_ON(!__i915_request_is_complete(old));
__i915_request_submit(old);
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 3c7f1ed92f5b..0ac23bf61f5d 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -714,7 +714,7 @@ static void fini_aliasing_ppgtt(struct i915_ggtt *ggtt)
{
struct i915_ppgtt *ppgtt;
- ppgtt = fetch_and_zero(&ggtt->alias);
+ ppgtt = __xchg(&ggtt->alias, NULL);
if (!ppgtt)
return;
@@ -1270,7 +1270,7 @@ bool i915_ggtt_resume_vm(struct i915_address_space *vm)
was_bound);
if (obj) { /* only used during resume => exclusive access */
- write_domain_objs |= fetch_and_zero(&obj->write_domain);
+ write_domain_objs |= __xchg(&obj->write_domain, 0);
obj->read_domains |= I915_GEM_DOMAIN_GTT;
}
}
diff --git a/drivers/gpu/drm/i915/gt/intel_gsc.c b/drivers/gpu/drm/i915/gt/intel_gsc.c
index bcc3605158db..38fbea757ba7 100644
--- a/drivers/gpu/drm/i915/gt/intel_gsc.c
+++ b/drivers/gpu/drm/i915/gt/intel_gsc.c
@@ -70,7 +70,7 @@ out_put:
static void gsc_ext_om_destroy(struct intel_gsc_intf *intf)
{
- struct drm_i915_gem_object *obj = fetch_and_zero(&intf->gem_obj);
+ struct drm_i915_gem_object *obj = __xchg(&intf->gem_obj, 0);
if (!obj)
return;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index 7a008e829d4d..52c86bd9e025 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -752,7 +752,7 @@ err_uc_init:
intel_uc_fini(&gt->uc);
err_engines:
intel_engines_release(gt);
- i915_vm_put(fetch_and_zero(&gt->vm));
+ i915_vm_put(__xchg(&gt->vm, 0));
err_pm:
intel_gt_pm_fini(gt);
intel_gt_fini_scratch(gt);
@@ -822,7 +822,7 @@ void intel_gt_driver_release(struct intel_gt *gt)
{
struct i915_address_space *vm;
- vm = fetch_and_zero(&gt->vm);
+ vm = __xchg(&gt->vm, 0);
if (vm) /* FIXME being called twice on error paths :( */
i915_vm_put(vm);
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
index e02cb90723ae..7dfb0ac58452 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c
@@ -99,7 +99,7 @@ static int __gt_unpark(struct intel_wakeref *wf)
static int __gt_park(struct intel_wakeref *wf)
{
struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref);
- intel_wakeref_t wakeref = fetch_and_zero(&gt->awake);
+ intel_wakeref_t wakeref = __xchg(&gt->awake, 0);
struct drm_i915_private *i915 = gt->i915;
GT_TRACE(gt, "\n");
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 81a96c52a92b..62fbfceb7371 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1109,7 +1109,7 @@ __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
- struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
+ struct intel_timeline *tl = __xchg(&ce->timeline, 0);
return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}
@@ -1226,8 +1226,8 @@ void lrc_fini(struct intel_context *ce)
if (!ce->state)
return;
- intel_ring_put(fetch_and_zero(&ce->ring));
- i915_vma_put(fetch_and_zero(&ce->state));
+ intel_ring_put(__xchg(&ce->ring, 0));
+ i915_vma_put(__xchg(&ce->state, 0));
}
void lrc_destroy(struct kref *kref)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 3f638f198796..3eab1867a4ab 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -1147,7 +1147,7 @@ void intel_migrate_fini(struct intel_migrate *m)
{
struct intel_context *ce;
- ce = fetch_and_zero(&m->context);
+ ce = __xchg(&m->context, 0);
if (!ce)
return;
diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c
index 8f3cd68d14f8..def413c26bb3 100644
--- a/drivers/gpu/drm/i915/gt/intel_rc6.c
+++ b/drivers/gpu/drm/i915/gt/intel_rc6.c
@@ -737,7 +737,7 @@ void intel_rc6_fini(struct intel_rc6 *rc6)
if (IS_METEORLAKE(rc6_to_i915(rc6)) && rc6->bios_state_captured)
set(uncore, GEN6_RC_STATE, rc6->bios_rc_state);
- pctx = fetch_and_zero(&rc6->pctx);
+ pctx = __xchg(&rc6->pctx, 0);
if (pctx)
i915_gem_object_put(pctx);
diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index b2671ac59dc0..301359b500a3 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -1803,7 +1803,7 @@ static void rps_work(struct work_struct *work)
u32 pm_iir = 0;
spin_lock_irq(gt->irq_lock);
- pm_iir = fetch_and_zero(&rps->pm_iir) & rps->pm_events;
+ pm_iir = __xchg(&rps->pm_iir, 0) & rps->pm_events;
client_boost = atomic_read(&rps->num_waiters);
spin_unlock_irq(gt->irq_lock);
diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
index 76fbae358072..3f49ca1debc6 100644
--- a/drivers/gpu/drm/i915/gt/selftest_context.c
+++ b/drivers/gpu/drm/i915/gt/selftest_context.c
@@ -171,7 +171,7 @@ static int live_context_size(void *arg)
* active state is sufficient, we are only checking that we
* don't use more than we planned.
*/
- saved = fetch_and_zero(&engine->default_state);
+ saved = __xchg(&engine->default_state, 0);
/* Overlaps with the execlists redzone */
engine->context_size += I915_GTT_PAGE_SIZE;
diff --git a/drivers/gpu/drm/i915/gt/selftest_ring_submission.c b/drivers/gpu/drm/i915/gt/selftest_ring_submission.c
index 87ceb0f374b6..a01aaca4fbf5 100644
--- a/drivers/gpu/drm/i915/gt/selftest_ring_submission.c
+++ b/drivers/gpu/drm/i915/gt/selftest_ring_submission.c
@@ -269,7 +269,7 @@ static int live_ctx_switch_wa(void *arg)
if (IS_GRAPHICS_VER(gt->i915, 4, 5))
continue; /* MI_STORE_DWORD is privileged! */
- saved_wa = fetch_and_zero(&engine->wa_ctx.vma);
+ saved_wa = __xchg(&engine->wa_ctx.vma, 0);
intel_engine_pm_get(engine);
err = __live_ctx_switch_wa(engine);
diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c
index 9f536c251179..9f03bdceae16 100644
--- a/drivers/gpu/drm/i915/gt/selftest_timeline.c
+++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c
@@ -896,7 +896,7 @@ static int create_watcher(struct hwsp_watcher *w,
static int check_watcher(struct hwsp_watcher *w, const char *name,
bool (*op)(u32 hwsp, u32 seqno))
{
- struct i915_request *rq = fetch_and_zero(&w->rq);
+ struct i915_request *rq = __xchg(&w->rq, NULL);
u32 offset, end;
int err;
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 4ccb4be4c9cb..38a8fe171841 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -168,7 +168,7 @@ static void __uc_capture_load_err_log(struct intel_uc *uc)
static void __uc_free_load_err_log(struct intel_uc *uc)
{
- struct drm_i915_gem_object *log = fetch_and_zero(&uc->load_err_log);
+ struct drm_i915_gem_object *log = __xchg(&uc->load_err_log, NULL);
if (log)
i915_gem_object_put(log);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
index 264c952f777b..e5aca84633f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c
@@ -1113,7 +1113,7 @@ void intel_uc_fw_cleanup_fetch(struct intel_uc_fw *uc_fw)
if (!intel_uc_fw_is_available(uc_fw))
return;
- i915_gem_object_put(fetch_and_zero(&uc_fw->obj));
+ i915_gem_object_put(__xchg(&uc_fw->obj, NULL));
intel_uc_fw_change_status(uc_fw, INTEL_UC_FIRMWARE_SELECTED);
}
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h
index 2c430c0c3bad..be7df2c384c8 100644
--- a/drivers/gpu/drm/i915/i915_utils.h
+++ b/drivers/gpu/drm/i915/i915_utils.h
@@ -26,6 +26,7 @@
#define __I915_UTILS_H
#include <linux/list.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/overflow.h>
#include <linux/sched.h>
#include <linux/string_helpers.h>
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index c6692fd5ab15..2111e97c3b63 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -211,7 +211,7 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
ring_info->ring_buffer = (struct hv_ring_buffer *)
vmap_pfn(pfns_wraparound, page_cnt * 2 - 1,
- PAGE_KERNEL);
+ pgprot_decrypted(PAGE_KERNEL));
kfree(pfns_wraparound);
if (!ring_info->ring_buffer)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index d24dd65b33d4..e9e1c4139e0d 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2156,7 +2156,6 @@ void vmbus_device_unregister(struct hv_device *device_obj)
* VMBUS is an acpi enumerated device. Get the information we
* need from DSDT.
*/
-#define VTPM_BASE_ADDRESS 0xfed40000
static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
{
resource_size_t start = 0;
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index c434b95dc8eb..c4ab3c457fbc 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -2,6 +2,7 @@
/*
* Helpers for IOMMU drivers implementing SVA
*/
+#include <linux/mmu_context.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/iommu.h>
@@ -16,14 +17,17 @@ static int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t ma
{
int ret = 0;
- if (!pasid_valid(min) || !pasid_valid(max) ||
+ if (min == IOMMU_PASID_INVALID || max == IOMMU_PASID_INVALID ||
min == 0 || max < min)
return -EINVAL;
+ if (!arch_pgtable_dma_compat(mm))
+ return -EBUSY;
+
mutex_lock(&iommu_sva_lock);
/* Is a PASID already associated with this mm? */
- if (pasid_valid(mm->pasid)) {
- if (mm->pasid < min || mm->pasid > max)
+ if (mm_valid_pasid(mm)) {
+ if (mm->pasid < min || mm->pasid >= max)
ret = -EOVERFLOW;
goto out;
}
@@ -208,7 +212,7 @@ out_put_mm:
void mm_pasid_drop(struct mm_struct *mm)
{
- if (likely(!pasid_valid(mm->pasid)))
+ if (likely(!mm_valid_pasid(mm)))
return;
ida_free(&iommu_global_pasid_ida, mm->pasid);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 493c31de0edb..3d4dd9420c30 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -580,7 +580,7 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
goto done;
}
- vaddr = untagged_addr(vaddr);
+ vaddr = untagged_addr_remote(mm, vaddr);
retry:
vma = vma_lookup(mm, vaddr);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6d07b42833be..5e0478f74bd1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -361,8 +361,7 @@ static int vhost_worker(void *data)
kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
- if (need_resched())
- schedule();
+ cond_resched();
}
}
diff --git a/drivers/virt/coco/sev-guest/sev-guest.c b/drivers/virt/coco/sev-guest/sev-guest.c
index 46f1a8d558b0..97dbe715e96a 100644
--- a/drivers/virt/coco/sev-guest/sev-guest.c
+++ b/drivers/virt/coco/sev-guest/sev-guest.c
@@ -46,7 +46,15 @@ struct snp_guest_dev {
void *certs_data;
struct snp_guest_crypto *crypto;
+ /* request and response are in unencrypted memory */
struct snp_guest_msg *request, *response;
+
+ /*
+ * Avoid information leakage by double-buffering shared messages
+ * in fields that are in regular encrypted memory.
+ */
+ struct snp_guest_msg secret_request, secret_response;
+
struct snp_secrets_page_layout *layout;
struct snp_req_data input;
u32 *os_area_msg_seqno;
@@ -266,14 +274,17 @@ static int dec_payload(struct snp_guest_dev *snp_dev, struct snp_guest_msg *msg,
static int verify_and_dec_payload(struct snp_guest_dev *snp_dev, void *payload, u32 sz)
{
struct snp_guest_crypto *crypto = snp_dev->crypto;
- struct snp_guest_msg *resp = snp_dev->response;
- struct snp_guest_msg *req = snp_dev->request;
+ struct snp_guest_msg *resp = &snp_dev->secret_response;
+ struct snp_guest_msg *req = &snp_dev->secret_request;
struct snp_guest_msg_hdr *req_hdr = &req->hdr;
struct snp_guest_msg_hdr *resp_hdr = &resp->hdr;
dev_dbg(snp_dev->dev, "response [seqno %lld type %d version %d sz %d]\n",
resp_hdr->msg_seqno, resp_hdr->msg_type, resp_hdr->msg_version, resp_hdr->msg_sz);
+ /* Copy response from shared memory to encrypted memory. */
+ memcpy(resp, snp_dev->response, sizeof(*resp));
+
/* Verify that the sequence counter is incremented by 1 */
if (unlikely(resp_hdr->msg_seqno != (req_hdr->msg_seqno + 1)))
return -EBADMSG;
@@ -297,7 +308,7 @@ static int verify_and_dec_payload(struct snp_guest_dev *snp_dev, void *payload,
static int enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8 type,
void *payload, size_t sz)
{
- struct snp_guest_msg *req = snp_dev->request;
+ struct snp_guest_msg *req = &snp_dev->secret_request;
struct snp_guest_msg_hdr *hdr = &req->hdr;
memset(req, 0, sizeof(*req));
@@ -321,11 +332,12 @@ static int enc_payload(struct snp_guest_dev *snp_dev, u64 seqno, int version, u8
return __enc_payload(snp_dev, req, payload, sz);
}
-static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, __u64 *fw_err)
+static int __handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code,
+ struct snp_guest_request_ioctl *rio)
{
- unsigned long err = 0xff, override_err = 0;
unsigned long req_start = jiffies;
unsigned int override_npages = 0;
+ u64 override_err = 0;
int rc;
retry_request:
@@ -335,7 +347,7 @@ retry_request:
* sequence number must be incremented or the VMPCK must be deleted to
* prevent reuse of the IV.
*/
- rc = snp_issue_guest_request(exit_code, &snp_dev->input, &err);
+ rc = snp_issue_guest_request(exit_code, &snp_dev->input, rio);
switch (rc) {
case -ENOSPC:
/*
@@ -353,7 +365,7 @@ retry_request:
* request buffer size was too small and give the caller the
* required buffer size.
*/
- override_err = SNP_GUEST_REQ_INVALID_LEN;
+ override_err = SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN);
/*
* If this call to the firmware succeeds, the sequence number can
@@ -366,7 +378,7 @@ retry_request:
goto retry_request;
/*
- * The host may return SNP_GUEST_REQ_ERR_EBUSY if the request has been
+ * The host may return SNP_GUEST_VMM_ERR_BUSY if the request has been
* throttled. Retry in the driver to avoid returning and reusing the
* message sequence number on a different message.
*/
@@ -387,27 +399,29 @@ retry_request:
*/
snp_inc_msg_seqno(snp_dev);
- if (fw_err)
- *fw_err = override_err ?: err;
+ if (override_err) {
+ rio->exitinfo2 = override_err;
+
+ /*
+ * If an extended guest request was issued and the supplied certificate
+ * buffer was not large enough, a standard guest request was issued to
+ * prevent IV reuse. If the standard request was successful, return -EIO
+ * back to the caller as would have originally been returned.
+ */
+ if (!rc && override_err == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
+ rc = -EIO;
+ }
if (override_npages)
snp_dev->input.data_npages = override_npages;
- /*
- * If an extended guest request was issued and the supplied certificate
- * buffer was not large enough, a standard guest request was issued to
- * prevent IV reuse. If the standard request was successful, return -EIO
- * back to the caller as would have originally been returned.
- */
- if (!rc && override_err == SNP_GUEST_REQ_INVALID_LEN)
- return -EIO;
-
return rc;
}
-static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, int msg_ver,
- u8 type, void *req_buf, size_t req_sz, void *resp_buf,
- u32 resp_sz, __u64 *fw_err)
+static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code,
+ struct snp_guest_request_ioctl *rio, u8 type,
+ void *req_buf, size_t req_sz, void *resp_buf,
+ u32 resp_sz)
{
u64 seqno;
int rc;
@@ -417,19 +431,31 @@ static int handle_guest_request(struct snp_guest_dev *snp_dev, u64 exit_code, in
if (!seqno)
return -EIO;
+ /* Clear shared memory's response for the host to populate. */
memset(snp_dev->response, 0, sizeof(struct snp_guest_msg));
- /* Encrypt the userspace provided payload */
- rc = enc_payload(snp_dev, seqno, msg_ver, type, req_buf, req_sz);
+ /* Encrypt the userspace provided payload in snp_dev->secret_request. */
+ rc = enc_payload(snp_dev, seqno, rio->msg_version, type, req_buf, req_sz);
if (rc)
return rc;
- rc = __handle_guest_request(snp_dev, exit_code, fw_err);
+ /*
+ * Write the fully encrypted request to the shared unencrypted
+ * request page.
+ */
+ memcpy(snp_dev->request, &snp_dev->secret_request,
+ sizeof(snp_dev->secret_request));
+
+ rc = __handle_guest_request(snp_dev, exit_code, rio);
if (rc) {
- if (rc == -EIO && *fw_err == SNP_GUEST_REQ_INVALID_LEN)
+ if (rc == -EIO &&
+ rio->exitinfo2 == SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN))
return rc;
- dev_alert(snp_dev->dev, "Detected error from ASP request. rc: %d, fw_err: %llu\n", rc, *fw_err);
+ dev_alert(snp_dev->dev,
+ "Detected error from ASP request. rc: %d, exitinfo2: 0x%llx\n",
+ rc, rio->exitinfo2);
+
snp_disable_vmpck(snp_dev);
return rc;
}
@@ -469,9 +495,9 @@ static int get_report(struct snp_guest_dev *snp_dev, struct snp_guest_request_io
if (!resp)
return -ENOMEM;
- rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version,
+ rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg,
SNP_MSG_REPORT_REQ, &req, sizeof(req), resp->data,
- resp_len, &arg->fw_err);
+ resp_len);
if (rc)
goto e_free;
@@ -509,9 +535,8 @@ static int get_derived_key(struct snp_guest_dev *snp_dev, struct snp_guest_reque
if (copy_from_user(&req, (void __user *)arg->req_data, sizeof(req)))
return -EFAULT;
- rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg->msg_version,
- SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len,
- &arg->fw_err);
+ rc = handle_guest_request(snp_dev, SVM_VMGEXIT_GUEST_REQUEST, arg,
+ SNP_MSG_KEY_REQ, &req, sizeof(req), buf, resp_len);
if (rc)
return rc;
@@ -571,12 +596,12 @@ cmd:
return -ENOMEM;
snp_dev->input.data_npages = npages;
- ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg->msg_version,
+ ret = handle_guest_request(snp_dev, SVM_VMGEXIT_EXT_GUEST_REQUEST, arg,
SNP_MSG_REPORT_REQ, &req.data,
- sizeof(req.data), resp->data, resp_len, &arg->fw_err);
+ sizeof(req.data), resp->data, resp_len);
/* If certs length is invalid then copy the returned length */
- if (arg->fw_err == SNP_GUEST_REQ_INVALID_LEN) {
+ if (arg->vmm_error == SNP_GUEST_VMM_ERR_INVALID_LEN) {
req.certs_len = snp_dev->input.data_npages << PAGE_SHIFT;
if (copy_to_user((void __user *)arg->req_data, &req, sizeof(req)))
@@ -611,7 +636,7 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long
if (copy_from_user(&input, argp, sizeof(input)))
return -EFAULT;
- input.fw_err = 0xff;
+ input.exitinfo2 = 0xff;
/* Message version must be non-zero */
if (!input.msg_version)
@@ -642,7 +667,7 @@ static long snp_guest_ioctl(struct file *file, unsigned int ioctl, unsigned long
mutex_unlock(&snp_cmd_mutex);
- if (input.fw_err && copy_to_user(argp, &input, sizeof(input)))
+ if (input.exitinfo2 && copy_to_user(argp, &input, sizeof(input)))
return -EFAULT;
return ret;
diff --git a/fs/aio.c b/fs/aio.c
index b0b17bd098bb..4a7576989719 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -558,7 +558,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
PROT_READ | PROT_WRITE,
- MAP_SHARED, 0, &unused, NULL);
+ MAP_SHARED, 0, 0, &unused, NULL);
mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9b0315d34c58..a880c4e44752 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,6 +91,7 @@
#include <linux/user_namespace.h>
#include <linux/fs_struct.h>
#include <linux/kthread.h>
+#include <linux/mmu_context.h>
#include <asm/processor.h>
#include "internal.h"
@@ -423,6 +424,16 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm)
seq_printf(m, "THP_enabled:\t%d\n", thp_enabled);
}
+static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
+{
+ seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
+}
+
+__weak void arch_proc_pid_thread_features(struct seq_file *m,
+ struct task_struct *task)
+{
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -438,6 +449,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_mem(m, mm);
task_core_dumping(m, task);
task_thp_status(m, mm);
+ task_untag_mask(m, mm);
mmput(mm);
}
task_sig(m, task);
@@ -446,6 +458,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
+ arch_proc_pid_thread_features(m, task);
return 0;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cb49479acd2e..38b19a757281 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -711,6 +711,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ [ilog2(VM_SHADOW_STACK)] = "ss",
+#endif
};
size_t i;
@@ -1688,8 +1691,13 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
/* watch out for wraparound */
start_vaddr = end_vaddr;
- if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
- start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
+ if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) {
+ ret = mmap_read_lock_killable(mm);
+ if (ret)
+ goto out_free;
+ start_vaddr = untagged_addr_remote(mm, svpfn << PAGE_SHIFT);
+ mmap_read_unlock(mm);
+ }
/* Ensure the address is inside the task */
if (start_vaddr > mm->task_size)
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index d7f6335d3999..e86c830728de 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -20,9 +20,9 @@ static inline unsigned long huge_pte_dirty(pte_t pte)
return pte_dirty(pte);
}
-static inline pte_t huge_pte_mkwrite(pte_t pte)
+static inline pte_t huge_pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
- return pte_mkwrite(pte);
+ return pte_mkwrite(pte, vma);
}
#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 8845a2eca339..90d7f68ed39d 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -26,6 +26,8 @@
#include <asm/ptrace.h>
#include <asm/hyperv-tlfs.h>
+#define VTPM_BASE_ADDRESS 0xfed40000
+
struct ms_hyperv_info {
u32 features;
u32 priv_high;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f83e4519c5f0..8582a7142623 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -182,7 +182,7 @@ void arch_cpu_idle(void);
void arch_cpu_idle_prepare(void);
void arch_cpu_idle_enter(void);
void arch_cpu_idle_exit(void);
-void arch_cpu_idle_dead(void);
+void __noreturn arch_cpu_idle_dead(void);
int cpu_report_state(int cpu);
int cpu_check_up_prepare(int cpu);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 54f535ff9868..0c5d4c3b59cd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1172,16 +1172,15 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream
return false;
}
-static inline bool pasid_valid(ioasid_t ioasid)
-{
- return ioasid != IOMMU_PASID_INVALID;
-}
-
#ifdef CONFIG_IOMMU_SVA
static inline void mm_pasid_init(struct mm_struct *mm)
{
mm->pasid = IOMMU_PASID_INVALID;
}
+static inline bool mm_valid_pasid(struct mm_struct *mm)
+{
+ return mm->pasid != IOMMU_PASID_INVALID;
+}
void mm_pasid_drop(struct mm_struct *mm);
struct iommu_sva *iommu_sva_bind_device(struct device *dev,
struct mm_struct *mm);
@@ -1203,6 +1202,7 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle)
return IOMMU_PASID_INVALID;
}
static inline void mm_pasid_init(struct mm_struct *mm) {}
+static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; }
static inline void mm_pasid_drop(struct mm_struct *mm) {}
#endif /* CONFIG_IOMMU_SVA */
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 293e29960c6e..9b9b38e89563 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/completion.h>
#include <linux/list.h>
+#include <linux/livepatch_sched.h>
#if IS_ENABLED(CONFIG_LIVEPATCH)
diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h
new file mode 100644
index 000000000000..013794fb5da0
--- /dev/null
+++ b/include/linux/livepatch_sched.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_LIVEPATCH_SCHED_H_
+#define _LINUX_LIVEPATCH_SCHED_H_
+
+#include <linux/jump_label.h>
+#include <linux/static_call_types.h>
+
+#ifdef CONFIG_LIVEPATCH
+
+void __klp_sched_try_switch(void);
+
+#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+static __always_inline void klp_sched_try_switch(void)
+{
+ if (static_branch_unlikely(&klp_sched_try_switch_key))
+ __klp_sched_try_switch();
+}
+
+#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+#else /* !CONFIG_LIVEPATCH */
+static inline void klp_sched_try_switch(void) {}
+static inline void __klp_sched_try_switch(void) {}
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 85bda2d02d65..4dc1d185ea98 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -50,6 +50,7 @@
#include <linux/atomic.h>
#include <linux/container_of.h>
+#include <linux/non-atomic/xchg.h>
#include <linux/stddef.h>
#include <linux/types.h>
@@ -241,10 +242,7 @@ static inline struct llist_node *llist_del_all(struct llist_head *head)
static inline struct llist_node *__llist_del_all(struct llist_head *head)
{
- struct llist_node *first = head->first;
-
- head->first = NULL;
- return first;
+ return __xchg(&head->first, NULL);
}
extern struct llist_node *llist_del_first(struct llist_head *head);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1a9de1484cc3..daa25506b853 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -98,17 +98,6 @@ extern int mmap_rnd_compat_bits __read_mostly;
#include <asm/page.h>
#include <asm/processor.h>
-/*
- * Architectures that support memory tagging (assigning tags to memory regions,
- * embedding these tags into addresses that point to these memory regions, and
- * checking that the memory and the pointer tags match on memory accesses)
- * redefine this macro to strip tags from pointers.
- * It's defined as noop for architectures that don't support memory tagging.
- */
-#ifndef untagged_addr
-#define untagged_addr(addr) (addr)
-#endif
-
#ifndef __pa_symbol
#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif
@@ -330,11 +319,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -350,6 +341,41 @@ extern unsigned int kobjsize(const void *objp);
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+/*
+ * This flag should not be set with VM_SHARED because of lack of support
+ * core mm. It will also get a guard page. This helps userspace protect
+ * itself from attacks. The reasoning is as follows:
+ *
+ * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
+ * INCSSP instruction can increment the shadow stack pointer. It is the
+ * shadow stack analog of an instruction like:
+ *
+ * addq $0x80, %rsp
+ *
+ * However, there is one important difference between an ADD on %rsp
+ * and INCSSP. In addition to modifying SSP, INCSSP also reads from the
+ * memory of the first and last elements that were "popped". It can be
+ * thought of as acting like this:
+ *
+ * READ_ONCE(ssp); // read+discard top element on stack
+ * ssp += nr_to_pop * 8; // move the shadow stack
+ * READ_ONCE(ssp-8); // read+discard last popped stack element
+ *
+ * The maximum distance INCSSP can move the SSP is 2040 bytes, before
+ * it would read the memory. Therefore a single page gap will be enough
+ * to prevent any operation from shifting the SSP to an adjacent stack,
+ * since it would have to land in the gap at least once, causing a
+ * fault.
+ *
+ * Prevent using INCSSP to move the SSP between shadow stacks by
+ * having a PAGE_SIZE guard gap.
+ */
+# define VM_SHADOW_STACK VM_HIGH_ARCH_5
+#else
+# define VM_SHADOW_STACK VM_NONE
+#endif
+
#if defined(CONFIG_X86)
# define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC)
@@ -381,7 +407,7 @@ extern unsigned int kobjsize(const void *objp);
#endif
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
-# define VM_UFFD_MINOR_BIT 37
+# define VM_UFFD_MINOR_BIT 38
# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR VM_NONE
@@ -1295,7 +1321,7 @@ void free_compound_page(struct page *page);
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, vma);
return pte;
}
@@ -3145,7 +3171,8 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf);
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long pgoff, unsigned long *populate, struct list_head *uf);
+ vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
+ struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool downgrade);
@@ -3238,15 +3265,26 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
return mtree_load(&mm->mm_mt, addr);
}
+static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_GROWSDOWN)
+ return stack_guard_gap;
+
+ /* See reasoning around the VM_SHADOW_STACK definition */
+ if (vma->vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
+ unsigned long gap = stack_guard_start_gap(vma);
unsigned long vm_start = vma->vm_start;
- if (vma->vm_flags & VM_GROWSDOWN) {
- vm_start -= stack_guard_gap;
- if (vm_start > vma->vm_start)
- vm_start = 0;
- }
+ vm_start -= gap;
+ if (vm_start > vma->vm_start)
+ vm_start = 0;
return vm_start;
}
diff --git a/include/linux/mman.h b/include/linux/mman.h
index cee1e4b566d8..40d94411d492 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -15,6 +15,9 @@
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
+#ifndef MAP_ABOVE4G
+#define MAP_ABOVE4G 0
+#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
@@ -50,6 +53,7 @@
| MAP_STACK \
| MAP_HUGETLB \
| MAP_32BIT \
+ | MAP_ABOVE4G \
| MAP_HUGE_2MB \
| MAP_HUGE_1GB)
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index b9b970f7ab45..f2b7a3f04099 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -28,4 +28,18 @@ static inline void leave_mm(int cpu) { }
# define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
#endif
+#ifndef mm_untag_mask
+static inline unsigned long mm_untag_mask(struct mm_struct *mm)
+{
+ return -1UL;
+}
+#endif
+
+#ifndef arch_pgtable_dma_compat
+static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
+{
+ return true;
+}
+#endif
+
#endif
diff --git a/include/linux/non-atomic/xchg.h b/include/linux/non-atomic/xchg.h
new file mode 100644
index 000000000000..f7fa5dd746f3
--- /dev/null
+++ b/include/linux/non-atomic/xchg.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_NON_ATOMIC_XCHG_H
+#define _LINUX_NON_ATOMIC_XCHG_H
+
+/**
+ * __xchg - set variable pointed by @ptr to @val, return old value
+ * @ptr: pointer to affected variable
+ * @val: value to be written
+ *
+ * This is non-atomic variant of xchg.
+ */
+#define __xchg(ptr, val) ({ \
+ __auto_type __ptr = ptr; \
+ __auto_type __t = *__ptr; \
+ *__ptr = (val); \
+ __t; \
+})
+
+#endif
diff --git a/include/linux/objtool.h b/include/linux/objtool.h
index 9ac3df3fccf0..03f82c2c2ebf 100644
--- a/include/linux/objtool.h
+++ b/include/linux/objtool.h
@@ -2,47 +2,7 @@
#ifndef _LINUX_OBJTOOL_H
#define _LINUX_OBJTOOL_H
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-/*
- * This struct is used by asm and inline asm code to manually annotate the
- * location of registers on the stack.
- */
-struct unwind_hint {
- u32 ip;
- s16 sp_offset;
- u8 sp_reg;
- u8 type;
- u8 signal;
- u8 end;
-};
-#endif
-
-/*
- * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
- * (the caller's SP right before it made the call). Used for all callable
- * functions, i.e. all C code and all callable asm functions.
- *
- * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
- * points to a fully populated pt_regs from a syscall, interrupt, or exception.
- *
- * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
- * sp_reg+sp_offset points to the iret return frame.
- *
- * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
- * Useful for code which doesn't have an ELF function annotation.
- *
- * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
- */
-#define UNWIND_HINT_TYPE_CALL 0
-#define UNWIND_HINT_TYPE_REGS 1
-#define UNWIND_HINT_TYPE_REGS_PARTIAL 2
-#define UNWIND_HINT_TYPE_FUNC 3
-#define UNWIND_HINT_TYPE_ENTRY 4
-#define UNWIND_HINT_TYPE_SAVE 5
-#define UNWIND_HINT_TYPE_RESTORE 6
+#include <linux/objtool_types.h>
#ifdef CONFIG_OBJTOOL
@@ -50,7 +10,7 @@ struct unwind_hint {
#ifndef __ASSEMBLY__
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \
"987: \n\t" \
".pushsection .discard.unwind_hints\n\t" \
/* struct unwind_hint */ \
@@ -59,7 +19,6 @@ struct unwind_hint {
".byte " __stringify(sp_reg) "\n\t" \
".byte " __stringify(type) "\n\t" \
".byte " __stringify(signal) "\n\t" \
- ".byte " __stringify(end) "\n\t" \
".balign 4 \n\t" \
".popsection\n\t"
@@ -89,7 +48,7 @@ struct unwind_hint {
#define ANNOTATE_NOENDBR \
"986: \n\t" \
".pushsection .discard.noendbr\n\t" \
- _ASM_PTR " 986b\n\t" \
+ ".long 986b - .\n\t" \
".popsection\n\t"
#define ASM_REACHABLE \
@@ -107,7 +66,7 @@ struct unwind_hint {
#define ANNOTATE_INTRA_FUNCTION_CALL \
999: \
.pushsection .discard.intra_function_calls; \
- .long 999b; \
+ .long 999b - .; \
.popsection;
/*
@@ -131,23 +90,22 @@ struct unwind_hint {
* the debuginfo as necessary. It will also warn if it sees any
* inconsistencies.
*/
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
-.Lunwind_hint_ip_\@:
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
+.Lhere_\@:
.pushsection .discard.unwind_hints
/* struct unwind_hint */
- .long .Lunwind_hint_ip_\@ - .
+ .long .Lhere_\@ - .
.short \sp_offset
.byte \sp_reg
.byte \type
.byte \signal
- .byte \end
.balign 4
.popsection
.endm
.macro STACK_FRAME_NON_STANDARD func:req
.pushsection .discard.func_stack_frame_non_standard, "aw"
- _ASM_PTR \func
+ .long \func - .
.popsection
.endm
@@ -160,8 +118,24 @@ struct unwind_hint {
.macro ANNOTATE_NOENDBR
.Lhere_\@:
.pushsection .discard.noendbr
- .quad .Lhere_\@
+ .long .Lhere_\@ - .
+ .popsection
+.endm
+
+/*
+ * Use objtool to validate the entry requirement that all code paths do
+ * VALIDATE_UNRET_END before RET.
+ *
+ * NOTE: The macro must be used at the beginning of a global symbol, otherwise
+ * it will be ignored.
+ */
+.macro VALIDATE_UNRET_BEGIN
+#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY)
+.Lhere_\@:
+ .pushsection .discard.validate_unret
+ .long .Lhere_\@ - .
.popsection
+#endif
.endm
.macro REACHABLE
@@ -177,15 +151,14 @@ struct unwind_hint {
#ifndef __ASSEMBLY__
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
- "\n\t"
+#define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t"
#define STACK_FRAME_NON_STANDARD(func)
#define STACK_FRAME_NON_STANDARD_FP(func)
#define ANNOTATE_NOENDBR
#define ASM_REACHABLE
#else
#define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
+.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0
.endm
.macro STACK_FRAME_NON_STANDARD func:req
.endm
diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h
new file mode 100644
index 000000000000..453a4f4ef39d
--- /dev/null
+++ b/include/linux/objtool_types.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_OBJTOOL_TYPES_H
+#define _LINUX_OBJTOOL_TYPES_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+ u8 signal;
+};
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * UNWIND_HINT_TYPE_UNDEFINED: A blind spot in ORC coverage which can result in
+ * a truncated and unreliable stack unwind.
+ *
+ * UNWIND_HINT_TYPE_END_OF_STACK: The end of the kernel stack unwind before
+ * hitting user entry, boot code, or fork entry (when there are no pt_regs
+ * available).
+ *
+ * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
+ * (the caller's SP right before it made the call). Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
+ * points to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
+ * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_TYPE_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
+ *
+ * UNWIND_HINT_TYPE_{SAVE,RESTORE}: Save the unwind metadata at a certain
+ * location so that it can be restored later.
+ */
+#define UNWIND_HINT_TYPE_UNDEFINED 0
+#define UNWIND_HINT_TYPE_END_OF_STACK 1
+#define UNWIND_HINT_TYPE_CALL 2
+#define UNWIND_HINT_TYPE_REGS 3
+#define UNWIND_HINT_TYPE_REGS_PARTIAL 4
+/* The below hint types don't have corresponding ORC types */
+#define UNWIND_HINT_TYPE_FUNC 5
+#define UNWIND_HINT_TYPE_SAVE 6
+#define UNWIND_HINT_TYPE_RESTORE 7
+
+#endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c5a51481bbb9..023918666dd4 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -291,6 +291,20 @@ static inline bool arch_has_hw_pte_young(void)
}
#endif
+#ifndef arch_check_zapped_pte
+static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
+ pte_t pte)
+{
+}
+#endif
+
+#ifndef arch_check_zapped_pmd
+static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 0260f5ea98fe..80ff8e533cbd 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -158,6 +158,8 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+void arch_proc_pid_thread_features(struct seq_file *m, struct task_struct *task);
+
#else /* CONFIG_PROC_FS */
static inline void proc_root_init(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3f5395ae86bc..07836629e3b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
+#include <linux/livepatch_sched.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -2067,6 +2068,9 @@ extern int __cond_resched(void);
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+void sched_dynamic_klp_enable(void);
+void sched_dynamic_klp_disable(void);
+
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
@@ -2075,6 +2079,7 @@ static __always_inline int _cond_resched(void)
}
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
extern int dynamic_cond_resched(void);
static __always_inline int _cond_resched(void)
@@ -2082,20 +2087,25 @@ static __always_inline int _cond_resched(void)
return dynamic_cond_resched();
}
-#else
+#else /* !CONFIG_PREEMPTION */
static inline int _cond_resched(void)
{
+ klp_sched_try_switch();
return __cond_resched();
}
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+ klp_sched_try_switch();
+ return 0;
+}
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
#define cond_resched() ({ \
__might_resched(__FILE__, __LINE__, 0); \
diff --git a/include/linux/smp.h b/include/linux/smp.h
index a80ab58ae3f1..ed8f344ba627 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -125,8 +125,15 @@ extern void smp_send_stop(void);
/*
* sends a 'reschedule' event to another CPU:
*/
-extern void smp_send_reschedule(int cpu);
-
+extern void arch_smp_send_reschedule(int cpu);
+/*
+ * scheduler_ipi() is inline so can't be passed as callback reason, but the
+ * callsite IP should be sufficient for root-causing IPIs sent from here.
+ */
+#define smp_send_reschedule(cpu) ({ \
+ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \
+ arch_smp_send_reschedule(cpu); \
+})
/*
* Prepare machine for booting other CPUs.
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 33a0ee3bcb2e..392dc11e3556 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1058,6 +1058,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
+asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags);
/*
* Architecture-specific system calls
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index ab9728138ad6..3064314f4832 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -11,6 +11,28 @@
#include <asm/uaccess.h>
/*
+ * Architectures that support memory tagging (assigning tags to memory regions,
+ * embedding these tags into addresses that point to these memory regions, and
+ * checking that the memory and the pointer tags match on memory accesses)
+ * redefine this macro to strip tags from pointers.
+ *
+ * Passing down mm_struct allows to define untagging rules on per-process
+ * basis.
+ *
+ * It's defined as noop for architectures that don't support memory tagging.
+ */
+#ifndef untagged_addr
+#define untagged_addr(addr) (addr)
+#endif
+
+#ifndef untagged_addr_remote
+#define untagged_addr_remote(mm, addr) ({ \
+ mmap_assert_locked(mm); \
+ untagged_addr(addr); \
+})
+#endif
+
+/*
* Architectures should provide two primitives (raw_copy_{to,from}_user())
* and get rid of their private instances of copy_{to,from}_user() and
* __copy_{to,from}_user{,_inatomic}().
diff --git a/include/trace/events/ipi.h b/include/trace/events/ipi.h
index 0be71dad6ec0..3de9bfc982ce 100644
--- a/include/trace/events/ipi.h
+++ b/include/trace/events/ipi.h
@@ -35,6 +35,50 @@ TRACE_EVENT(ipi_raise,
TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
);
+TRACE_EVENT(ipi_send_cpu,
+
+ TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),
+
+ TP_ARGS(cpu, callsite, callback),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, cpu)
+ __field(void *, callsite)
+ __field(void *, callback)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ __entry->callsite = (void *)callsite;
+ __entry->callback = callback;
+ ),
+
+ TP_printk("cpu=%u callsite=%pS callback=%pS",
+ __entry->cpu, __entry->callsite, __entry->callback)
+);
+
+TRACE_EVENT(ipi_send_cpumask,
+
+ TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),
+
+ TP_ARGS(cpumask, callsite, callback),
+
+ TP_STRUCT__entry(
+ __cpumask(cpumask)
+ __field(void *, callsite)
+ __field(void *, callback)
+ ),
+
+ TP_fast_assign(
+ __assign_cpumask(cpumask, cpumask_bits(cpumask));
+ __entry->callsite = (void *)callsite;
+ __entry->callback = callback;
+ ),
+
+ TP_printk("cpumask=%s callsite=%pS callback=%pS",
+ __get_cpumask(cpumask), __entry->callsite, __entry->callback)
+);
+
DECLARE_EVENT_CLASS(ipi_handler,
TP_PROTO(const char *reason),
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index ffbe4cec9f32..0f52d0ac47c5 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -242,7 +242,8 @@ typedef struct siginfo {
#define SEGV_ADIPERR 7 /* Precise MCD exception */
#define SEGV_MTEAERR 8 /* Asynchronous ARM MTE error */
#define SEGV_MTESERR 9 /* Synchronous ARM MTE exception */
-#define NSIGSEGV 9
+#define SEGV_CPERR 10 /* Control protection fault */
+#define NSIGSEGV 10
/*
* SIGBUS si_codes
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..b12940ec5926 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -887,7 +887,7 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
#undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index ac3da855fb19..fa1ceeae2596 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -406,6 +406,8 @@ typedef struct elf64_shdr {
#define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */
#define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */
#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */
+/* Old binutils treats 0x203 as a CET state */
+#define NT_X86_SHSTK 0x204 /* x86 SHSTK state */
#define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */
#define NT_S390_TIMER 0x301 /* s390 timer register */
#define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */
diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h
index 91b4c63d5cbf..1c9da485318f 100644
--- a/include/uapi/linux/psp-sev.h
+++ b/include/uapi/linux/psp-sev.h
@@ -36,6 +36,13 @@ enum {
* SEV Firmware status code
*/
typedef enum {
+ /*
+ * This error code is not in the SEV spec. Its purpose is to convey that
+ * there was an error that prevented the SEV firmware from being called.
+ * The SEV API error codes are 16 bits, so the -1 value will not overlap
+ * with possible values from the specification.
+ */
+ SEV_RET_NO_FW_CALL = -1,
SEV_RET_SUCCESS = 0,
SEV_RET_INVALID_PLATFORM_STATE,
SEV_RET_INVALID_GUEST_STATE,
diff --git a/include/uapi/linux/sev-guest.h b/include/uapi/linux/sev-guest.h
index 256aaeff7e65..2aa39112cf8d 100644
--- a/include/uapi/linux/sev-guest.h
+++ b/include/uapi/linux/sev-guest.h
@@ -52,8 +52,14 @@ struct snp_guest_request_ioctl {
__u64 req_data;
__u64 resp_data;
- /* firmware error code on failure (see psp-sev.h) */
- __u64 fw_err;
+ /* bits[63:32]: VMM error code, bits[31:0] firmware error code (see psp-sev.h) */
+ union {
+ __u64 exitinfo2;
+ struct {
+ __u32 fw_error;
+ __u32 vmm_error;
+ };
+ };
};
struct snp_ext_report_req {
@@ -77,4 +83,12 @@ struct snp_ext_report_req {
/* Get SNP extended report as defined in the GHCB specification version 2. */
#define SNP_GET_EXT_REPORT _IOWR(SNP_GUEST_REQ_IOC_TYPE, 0x2, struct snp_guest_request_ioctl)
+/* Guest message request EXIT_INFO_2 constants */
+#define SNP_GUEST_FW_ERR_MASK GENMASK_ULL(31, 0)
+#define SNP_GUEST_VMM_ERR_SHIFT 32
+#define SNP_GUEST_VMM_ERR(x) (((u64)x) << SNP_GUEST_VMM_ERR_SHIFT)
+
+#define SNP_GUEST_VMM_ERR_INVALID_LEN 1
+#define SNP_GUEST_VMM_ERR_BUSY 2
+
#endif /* __UAPI_LINUX_SEV_GUEST_H_ */
diff --git a/init/main.c b/init/main.c
index c3a0602b172d..b55954043c98 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1025,14 +1025,6 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
*/
locking_selftest();
- /*
- * This needs to be called before any devices perform DMA
- * operations that might use the SWIOTLB bounce buffers. It will
- * mark the bounce buffers as decrypted so that their usage will
- * not cause "plain-text" data to be decrypted when accessed.
- */
- mem_encrypt_init();
-
#ifdef CONFIG_BLK_DEV_INITRD
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
@@ -1049,6 +1041,17 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
late_time_init();
sched_clock_init();
calibrate_delay();
+
+ /*
+ * This needs to be called before any devices perform DMA
+ * operations that might use the SWIOTLB bounce buffers. It will
+ * mark the bounce buffers as decrypted so that their usage will
+ * not cause "plain-text" data to be decrypted when accessed. It
+ * must be called after late_time_init() so that Hyper-V x86/x64
+ * hypercalls work when the SWIOTLB bounce buffers are decrypted.
+ */
+ mem_encrypt_init();
+
pid_idr_init();
anon_vma_init();
#ifdef CONFIG_X86
diff --git a/ipc/shm.c b/ipc/shm.c
index 60e45e7045d4..576a543b7cff 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1662,7 +1662,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
goto invalid;
}
- addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
+ addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL);
*raddr = addr;
err = 0;
if (IS_ERR_VALUE(addr))
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 7afa40fe5cc4..2f4fb336dda1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -22,6 +22,8 @@
#include <asm/processor.h>
#include <linux/kasan.h>
+#include <trace/events/ipi.h>
+
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);
@@ -74,6 +76,14 @@ void __weak arch_irq_work_raise(void)
*/
}
+static __always_inline void irq_work_raise(struct irq_work *work)
+{
+ if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+ trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+
+ arch_irq_work_raise();
+}
+
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
@@ -99,7 +109,7 @@ static void __irq_work_queue_local(struct irq_work *work)
/* If the work is "lazy", handle it from next tick if any */
if (!lazy_work || tick_nohz_tick_stopped())
- arch_irq_work_raise();
+ irq_work_raise(work);
}
/* Enqueue the irq work @work on the current CPU */
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 1de2c40cc378..246e2d50e197 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f1b25ec581e0..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,11 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
#define MAX_STACK_ENTRIES 100
+DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
@@ -25,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched(). This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
*/
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
if (ret < 0)
return -EINVAL;
nr_entries = ret;
@@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
switch (ret) {
case 0: /* success */
break;
@@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
return !ret;
}
+void __klp_sched_try_switch(void)
+{
+ if (likely(!klp_patch_pending(current)))
+ return;
+
+ /*
+ * This function is called from cond_resched() which is called in many
+ * places throughout the kernel. Using the klp_mutex here might
+ * deadlock.
+ *
+ * Instead, disable preemption to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch this task while it's running.
+ */
+ preempt_disable();
+
+ /*
+ * Make sure current didn't get patched between the above check and
+ * preempt_disable().
+ */
+ if (unlikely(!klp_patch_pending(current)))
+ goto out;
+
+ /*
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
+ */
+ smp_rmb();
+
+ klp_try_switch_task(current);
+
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL(__klp_sched_try_switch);
+
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
* Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -492,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_cond_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_UNDEFINED.
*/
smp_wmb();
@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -601,9 +668,28 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
@@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
* the task flag up to date with the parent here.
*
* The operation is serialized against all klp_*_transition()
- * operations by the tasklist_lock. The only exception is
- * klp_update_patch_state(current), but we cannot race with
- * that because we are current.
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
*/
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index c201aadb9301..25ec0239477c 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -72,15 +72,6 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
int ret;
raw_spin_lock_irq(&rtm->wait_lock);
- /*
- * Allow readers, as long as the writer has not completely
- * acquired the semaphore for write.
- */
- if (atomic_read(&rwb->readers) != WRITER_BIAS) {
- atomic_inc(&rwb->readers);
- raw_spin_unlock_irq(&rtm->wait_lock);
- return 0;
- }
/*
* Call into the slow lock path with the rtmutex->wait_lock
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 143e46bd2a68..94be4eebfa53 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -80,6 +80,7 @@
#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
+#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
@@ -95,6 +96,9 @@
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
+
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
@@ -261,36 +265,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
-/*
- * Find left-most (aka, highest priority) task matching @cookie.
- */
-static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
- struct rb_node *node;
-
- node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
- /*
- * The idle task always matches any cookie!
- */
- if (!node)
- return idle_sched_class.pick_task(rq);
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
- return __node_2_sc(node);
+ return 0;
}
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
+
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
+
+ return p;
+}
+
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
- node = rb_next(node);
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
if (!node)
return NULL;
- p = container_of(node, struct task_struct, core_node);
- if (p->core_cookie != cookie)
- return NULL;
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
- return p;
+ return sched_core_next(p, cookie);
}
/*
@@ -3829,14 +3848,20 @@ void sched_ttwu_pending(void *arg)
rq_unlock_irqrestore(rq, &rf);
}
-void send_call_function_single_ipi(int cpu)
+/*
+ * Prepare the scene for sending an IPI for a remote smp_call
+ *
+ * Returns true if the caller can proceed with sending the IPI.
+ * Returns false otherwise.
+ */
+bool call_function_single_prep_ipi(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (!set_nr_if_polling(rq->idle))
- arch_send_call_function_single_ipi(cpu);
- else
+ if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
+ return false;
+ }
+
+ return true;
}
/*
@@ -6240,7 +6265,7 @@ static bool try_steal_cookie(int this, int that)
goto unlock;
p = sched_core_find(src, cookie);
- if (p == src->idle)
+ if (!p)
goto unlock;
do {
@@ -6252,6 +6277,13 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure that task @p
+ * is not throttled now, we also need to check whether the runqueue
+ * of the destination CPU is being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
@@ -8507,6 +8539,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
+ klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
@@ -8655,13 +8688,17 @@ int sched_dynamic_mode(const char *str)
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
-void sched_dynamic_update(int mode)
+DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
{
/*
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
* the ZERO state, which is invalid.
*/
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -8669,36 +8706,79 @@ void sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: none\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: voluntary\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
- preempt_dynamic_disable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
break;
}
preempt_dynamic_mode = mode;
}
+void sched_dynamic_update(int mode)
+{
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+ __klp_sched_try_switch();
+ return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = false;
+ __sched_dynamic_update(preempt_dynamic_mode);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
@@ -10333,7 +10413,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk)
+static struct task_group *sched_get_task_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -10345,7 +10425,13 @@ static void sched_change_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
- tsk->sched_task_group = tg;
+
+ return tg;
+}
+
+static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+{
+ tsk->sched_task_group = group;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -10366,10 +10452,19 @@ void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct task_group *group;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ /*
+ * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+ * group changes.
+ */
+ group = sched_get_task_group(tsk);
+ if (group == tsk->sched_task_group)
+ goto unlock;
+
update_rq_clock(rq);
running = task_current(rq, tsk);
@@ -10380,7 +10475,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk);
+ sched_change_group(tsk, group);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -10394,6 +10489,7 @@ void sched_move_task(struct task_struct *tsk)
resched_curr(rq);
}
+unlock:
task_rq_unlock(rq, tsk, &rf);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 71b24371a6f7..4cc7e1ca066d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2704,6 +2704,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
#endif
}
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+ return p->dl.dl_throttled;
+}
+#endif
+
DEFINE_SCHED_CLASS(dl) = {
.enqueue_task = enqueue_task_dl,
@@ -2736,6 +2743,9 @@ DEFINE_SCHED_CLASS(dl) = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_dl,
+#endif
};
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db6fc9d978ae..a29ca11bead2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6016,6 +6016,10 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
+
+ /* Add a random offset so that timers interleave */
+ hrtimer_set_expires(&cfs_b->period_timer,
+ get_random_u32_below(cfs_b->period));
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->slack_started = false;
@@ -12023,6 +12027,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
return delta > 0;
}
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+ cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+ return throttled_hierarchy(cfs_rq);
+}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
@@ -12649,6 +12665,10 @@ DEFINE_SCHED_CLASS(fair) = {
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_fair,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e9ef66be2870..342f58a329f5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -75,7 +75,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0a11f44adee5..9d67dfbf1000 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2677,6 +2677,21 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_rt(struct task_struct *p, int cpu)
+{
+ struct rt_rq *rt_rq;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq = task_group(p)->rt_rq[cpu];
+#else
+ rt_rq = &cpu_rq(cpu)->rt;
+#endif
+
+ return rt_rq_throttled(rt_rq);
+}
+#endif
+
DEFINE_SCHED_CLASS(rt) = {
.enqueue_task = enqueue_task_rt,
@@ -2710,6 +2725,10 @@ DEFINE_SCHED_CLASS(rt) = {
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_rt,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3e8df6d31c1e..060616944d7a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2224,6 +2224,10 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p);
#endif
+
+#ifdef CONFIG_SCHED_CORE
+ int (*task_is_throttled)(struct task_struct *p, int cpu);
+#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 2eb23dd0f285..21ac44428bb0 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -6,7 +6,7 @@
extern void sched_ttwu_pending(void *arg);
-extern void send_call_function_single_ipi(int cpu);
+extern bool call_function_single_prep_ipi(int cpu);
#ifdef CONFIG_SMP
extern void flush_smp_call_function_queue(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 051aaf65c749..6682535e37c8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -209,8 +209,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
static unsigned int sysctl_sched_energy_aware = 1;
-DEFINE_MUTEX(sched_energy_mutex);
-bool sched_energy_update;
+static DEFINE_MUTEX(sched_energy_mutex);
+static bool sched_energy_update;
void rebuild_sched_domains_energy(void)
{
diff --git a/kernel/smp.c b/kernel/smp.c
index 06a413987a14..ab3e5dad6cfe 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -26,68 +26,15 @@
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
+#include <trace/events/ipi.h>
+
#include "smpboot.h"
#include "sched/smp.h"
#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
-union cfd_seq_cnt {
- u64 val;
- struct {
- u64 src:16;
- u64 dst:16;
-#define CFD_SEQ_NOCPU 0xffff
- u64 type:4;
-#define CFD_SEQ_QUEUE 0
-#define CFD_SEQ_IPI 1
-#define CFD_SEQ_NOIPI 2
-#define CFD_SEQ_PING 3
-#define CFD_SEQ_PINGED 4
-#define CFD_SEQ_HANDLE 5
-#define CFD_SEQ_DEQUEUE 6
-#define CFD_SEQ_IDLE 7
-#define CFD_SEQ_GOTIPI 8
-#define CFD_SEQ_HDLEND 9
- u64 cnt:28;
- } u;
-};
-
-static char *seq_type[] = {
- [CFD_SEQ_QUEUE] = "queue",
- [CFD_SEQ_IPI] = "ipi",
- [CFD_SEQ_NOIPI] = "noipi",
- [CFD_SEQ_PING] = "ping",
- [CFD_SEQ_PINGED] = "pinged",
- [CFD_SEQ_HANDLE] = "handle",
- [CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)",
- [CFD_SEQ_IDLE] = "idle",
- [CFD_SEQ_GOTIPI] = "gotipi",
- [CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)",
-};
-
-struct cfd_seq_local {
- u64 ping;
- u64 pinged;
- u64 handle;
- u64 dequeue;
- u64 idle;
- u64 gotipi;
- u64 hdlend;
-};
-#endif
-
-struct cfd_percpu {
- call_single_data_t csd;
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- u64 seq_queue;
- u64 seq_ipi;
- u64 seq_noipi;
-#endif
-};
-
struct call_function_data {
- struct cfd_percpu __percpu *pcpu;
+ call_single_data_t __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -110,8 +57,8 @@ int smpcfd_prepare_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
- cfd->pcpu = alloc_percpu(struct cfd_percpu);
- if (!cfd->pcpu) {
+ cfd->csd = alloc_percpu(call_single_data_t);
+ if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
@@ -126,7 +73,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
- free_percpu(cfd->pcpu);
+ free_percpu(cfd->csd);
return 0;
}
@@ -156,23 +103,49 @@ void __init call_function_init(void)
smpcfd_prepare_cpu(smp_processor_id());
}
+static __always_inline void
+send_call_function_single_ipi(int cpu)
+{
+ if (call_function_single_prep_ipi(cpu)) {
+ trace_ipi_send_cpu(cpu, _RET_IP_,
+ generic_smp_call_function_single_interrupt);
+ arch_send_call_function_single_ipi(cpu);
+ }
+}
+
+static __always_inline void
+send_call_function_ipi_mask(struct cpumask *mask)
+{
+ trace_ipi_send_cpumask(mask, _RET_IP_,
+ generic_smp_call_function_single_interrupt);
+ arch_send_call_function_ipi_mask(mask);
+}
+
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
-static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled);
-static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended);
+static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
+/*
+ * Parse the csdlock_debug= kernel boot parameter.
+ *
+ * If you need to restore the old "ext" value that once provided
+ * additional debugging information, reapply the following commits:
+ *
+ * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
+ * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
+ */
static int __init csdlock_debug(char *str)
{
+ int ret;
unsigned int val = 0;
- if (str && !strcmp(str, "ext")) {
- val = 1;
- static_branch_enable(&csdlock_debug_extended);
- } else
- get_option(&str, &val);
-
- if (val)
- static_branch_enable(&csdlock_debug_enabled);
+ ret = get_option(&str, &val);
+ if (ret) {
+ if (val)
+ static_branch_enable(&csdlock_debug_enabled);
+ else
+ static_branch_disable(&csdlock_debug_enabled);
+ }
return 1;
}
@@ -181,36 +154,11 @@ __setup("csdlock_debug=", csdlock_debug);
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);
-static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local);
static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
static atomic_t csd_bug_count = ATOMIC_INIT(0);
-static u64 cfd_seq;
-
-#define CFD_SEQ(s, d, t, c) \
- (union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c }
-
-static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
-{
- union cfd_seq_cnt new, old;
-
- new = CFD_SEQ(src, dst, type, 0);
-
- do {
- old.val = READ_ONCE(cfd_seq);
- new.u.cnt = old.u.cnt + 1;
- } while (cmpxchg(&cfd_seq, old.val, new.val) != old.val);
-
- return old.val;
-}
-
-#define cfd_seq_store(var, src, dst, type) \
- do { \
- if (static_branch_unlikely(&csdlock_debug_extended)) \
- var = cfd_seq_inc(src, dst, type); \
- } while (0)
/* Record current CSD work for current CPU, NULL to erase. */
static void __csd_lock_record(struct __call_single_data *csd)
@@ -244,80 +192,6 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd)
return -1;
}
-static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst,
- unsigned int type, union cfd_seq_cnt *data,
- unsigned int *n_data, unsigned int now)
-{
- union cfd_seq_cnt new[2];
- unsigned int i, j, k;
-
- new[0].val = val;
- new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1);
-
- for (i = 0; i < 2; i++) {
- if (new[i].u.cnt <= now)
- new[i].u.cnt |= 0x80000000U;
- for (j = 0; j < *n_data; j++) {
- if (new[i].u.cnt == data[j].u.cnt) {
- /* Direct read value trumps generated one. */
- if (i == 0)
- data[j].val = new[i].val;
- break;
- }
- if (new[i].u.cnt < data[j].u.cnt) {
- for (k = *n_data; k > j; k--)
- data[k].val = data[k - 1].val;
- data[j].val = new[i].val;
- (*n_data)++;
- break;
- }
- }
- if (j == *n_data) {
- data[j].val = new[i].val;
- (*n_data)++;
- }
- }
-}
-
-static const char *csd_lock_get_type(unsigned int type)
-{
- return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
-}
-
-static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
-{
- struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
- unsigned int srccpu = csd->node.src;
- struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu);
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
- unsigned int now;
- union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)];
- unsigned int n_data = 0, i;
-
- data[0].val = READ_ONCE(cfd_seq);
- now = data[0].u.cnt;
-
- cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now);
- cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now);
- cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now);
-
- cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now);
- cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now);
-
- cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now);
- cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now);
- cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HANDLE, data, &n_data, now);
- cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now);
- cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now);
-
- for (i = 0; i < n_data; i++) {
- pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n",
- data[i].u.cnt & ~0x80000000U, data[i].u.src,
- data[i].u.dst, csd_lock_get_type(data[i].u.type));
- }
- pr_alert("\tcsd: cnt now: %07x\n", now);
-}
-
/*
* Complain if too much time spent waiting. Note that only
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
@@ -368,8 +242,6 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
*bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
}
if (cpu >= 0) {
- if (static_branch_unlikely(&csdlock_debug_extended))
- csd_lock_print_extended(csd, cpu);
dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
@@ -412,27 +284,7 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd)
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
-
-static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
-{
- unsigned int this_cpu = smp_processor_id();
- struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local);
- struct call_function_data *cfd = this_cpu_ptr(&cfd_data);
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
-
- cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
- if (llist_add(node, &per_cpu(call_single_queue, cpu))) {
- cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
- cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING);
- send_call_function_single_ipi(cpu);
- cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED);
- } else {
- cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
- }
-}
#else
-#define cfd_seq_store(var, src, dst, type)
-
static void csd_lock_record(struct __call_single_data *csd)
{
}
@@ -470,23 +322,29 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- if (static_branch_unlikely(&csdlock_debug_extended)) {
- unsigned int type;
-
- type = CSD_TYPE(container_of(node, call_single_data_t,
- node.llist));
- if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) {
- __smp_call_single_queue_debug(cpu, node);
- return;
- }
+ /*
+ * We have to check the type of the CSD before queueing it, because
+ * once queued it can have its flags cleared by
+ * flush_smp_call_function_queue()
+ * even if we haven't sent the smp_call IPI yet (e.g. the stopper
+ * executes migration_cpu_stop() on the remote CPU).
+ */
+ if (trace_ipi_send_cpu_enabled()) {
+ call_single_data_t *csd;
+ smp_call_func_t func;
+
+ csd = container_of(node, call_single_data_t, node.llist);
+ func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
+ sched_ttwu_pending : csd->func;
+
+ trace_ipi_send_cpu(cpu, _RET_IP_, func);
}
-#endif
/*
- * The list addition should be visible before sending the IPI
- * handler locks the list to pull the entry off it because of
- * normal cache coherency rules implied by spinlocks.
+ * The list addition should be visible to the target CPU when it pops
+ * the head of the list to pull the entry off it in the IPI handler
+ * because of normal cache coherency rules implied by the underlying
+ * llist ops.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
@@ -541,8 +399,6 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd)
*/
void generic_smp_call_function_single_interrupt(void)
{
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_GOTIPI);
__flush_smp_call_function_queue(true);
}
@@ -570,13 +426,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
lockdep_assert_irqs_disabled();
head = this_cpu_ptr(&call_single_queue);
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_HANDLE);
entry = llist_del_all(head);
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue,
- /* Special meaning of source cpu: 0 == queue empty */
- entry ? CFD_SEQ_NOCPU : 0,
- smp_processor_id(), CFD_SEQ_DEQUEUE);
entry = llist_reverse_order(entry);
/* There shouldn't be any pending callbacks on an offline CPU. */
@@ -635,12 +485,8 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
}
}
- if (!entry) {
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend,
- 0, smp_processor_id(),
- CFD_SEQ_HDLEND);
+ if (!entry)
return;
- }
/*
* Second; run all !SYNC callbacks.
@@ -678,9 +524,6 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
*/
if (entry)
sched_ttwu_pending(entry);
-
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_HDLEND);
}
@@ -704,8 +547,6 @@ void flush_smp_call_function_queue(void)
if (llist_empty(this_cpu_ptr(&call_single_queue)))
return;
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags);
/* Get the already pending soft interrupts for RT enabled kernels */
was_pending = local_softirq_pending();
@@ -887,9 +728,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
bool wait = scf_flags & SCF_WAIT;
+ int nr_cpus = 0, nr_queued = 0;
bool run_remote = false;
bool run_local = false;
- int nr_cpus = 0;
lockdep_assert_preemption_disabled();
@@ -929,11 +770,12 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
- call_single_data_t *csd = &pcpu->csd;
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
- if (cond_func && !cond_func(cpu, info))
+ if (cond_func && !cond_func(cpu, info)) {
+ __cpumask_clear_cpu(cpu, cfd->cpumask);
continue;
+ }
csd_lock(csd);
if (wait)
@@ -944,19 +786,20 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
- cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
last_cpu = cpu;
-
- cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
- } else {
- cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
}
+ nr_queued++;
}
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING);
+ /*
+ * Trace each smp_function_call_*() as an IPI, actual IPIs
+ * will be traced with func==generic_smp_call_function_single_ipi().
+ */
+ if (nr_queued)
+ trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func);
/*
* Choose the most efficient way to send an IPI. Note that the
@@ -966,9 +809,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
if (nr_cpus == 1)
send_call_function_single_ipi(last_cpu);
else if (likely(nr_cpus > 1))
- arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
-
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED);
+ send_call_function_ipi_mask(cfd->cpumask_ipi);
}
if (run_local && (!cond_func || cond_func(this_cpu, info))) {
@@ -983,7 +824,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd;
- csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd;
+ csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
}
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..cb9aebd34646 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -381,6 +381,7 @@ COND_SYSCALL(vm86old);
COND_SYSCALL(modify_ldt);
COND_SYSCALL(vm86);
COND_SYSCALL(kexec_file_load);
+COND_SYSCALL(map_shadow_stack);
/* s390 */
COND_SYSCALL(s390_pci_mmio_read);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 613ffd6e6c03..f0d5b82e478d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1490,6 +1490,15 @@ config CSD_LOCK_WAIT_DEBUG
include the IPI handler function currently executing (if any)
and relevant stack traces.
+config CSD_LOCK_WAIT_DEBUG_DEFAULT
+ bool "Default csd_lock_wait() debugging on at boot time"
+ depends on CSD_LOCK_WAIT_DEBUG
+ depends on 64BIT
+ default n
+ help
+ This option causes the csdlock_debug= kernel boot parameter to
+ default to 1 (basic debugging) instead of 0 (no debugging).
+
endmenu # lock debugging
config TRACE_IRQFLAGS
diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile
index e814061d6aa0..9f031eafc465 100644
--- a/lib/vdso/Makefile
+++ b/lib/vdso/Makefile
@@ -5,18 +5,13 @@ GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH))
c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c)
-# This cmd checks that the vdso library does not contain absolute relocation
+# This cmd checks that the vdso library does not contain dynamic relocations.
# It has to be called after the linking of the vdso library and requires it
# as a parameter.
#
-# $(ARCH_REL_TYPE_ABS) is defined in the arch specific makefile and corresponds
-# to the absolute relocation types printed by "objdump -R" and accepted by the
-# dynamic linker.
-ifndef ARCH_REL_TYPE_ABS
-$(error ARCH_REL_TYPE_ABS is not set)
-endif
-
+# As a workaround for some GNU ld ports which produce unneeded R_*_NONE
+# dynamic relocations, ignore R_*_NONE.
quiet_cmd_vdso_check = VDSOCHK $@
- cmd_vdso_check = if $(OBJDUMP) -R $@ | grep -E -h "$(ARCH_REL_TYPE_ABS)"; \
+ cmd_vdso_check = if $(READELF) -rW $@ | grep -v _NONE | grep -q " R_\w*_"; \
then (echo >&2 "$@: dynamic relocations are not supported"; \
rm -f $@; /bin/false); fi
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c54177aabebd..b1f85c49ab06 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -109,10 +109,10 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
- WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte))));
+ WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte), args->vma)));
WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
- WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+ WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte, args->vma))));
WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
@@ -153,7 +153,7 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
pte = pte_mkclean(pte);
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
flush_dcache_page(page);
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, args->vma);
pte = pte_mkdirty(pte);
ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1);
pte = ptep_get(args->ptep);
@@ -199,10 +199,10 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
- WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd))));
+ WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd), args->vma)));
WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
- WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd, args->vma))));
WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
/*
@@ -253,7 +253,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
pmd = pmd_mkclean(pmd);
set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
flush_dcache_page(page);
- pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkwrite(pmd, args->vma);
pmd = pmd_mkdirty(pmd);
pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
pmd = READ_ONCE(*args->pmdp);
@@ -928,8 +928,8 @@ static void __init hugetlb_basic_tests(struct pgtable_debug_args *args)
pte = mk_huge_pte(page, args->page_prot);
WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
- WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
- WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
+ WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte), args->vma)));
+ WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte, args->vma))));
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot);
diff --git a/mm/gup.c b/mm/gup.c
index 1f72a717232b..80258a640135 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -978,7 +978,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
return -EFAULT;
if (write) {
- if (!(vm_flags & VM_WRITE)) {
+ if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
@@ -1085,7 +1085,7 @@ static long __get_user_pages(struct mm_struct *mm,
if (!nr_pages)
return 0;
- start = untagged_addr(start);
+ start = untagged_addr_remote(mm, start);
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
@@ -1259,7 +1259,7 @@ int fixup_user_fault(struct mm_struct *mm,
struct vm_area_struct *vma;
vm_fault_t ret;
- address = untagged_addr(address);
+ address = untagged_addr_remote(mm, address);
if (unlocked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2aacf10e95d6..935b6ab2f4ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -551,7 +551,7 @@ __setup("transparent_hugepage=", setup_transparent_hugepage);
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
- pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkwrite(pmd, vma);
return pmd;
}
@@ -1572,7 +1572,7 @@ out_map:
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (writable)
- pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkwrite(pmd, vma);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
@@ -1681,6 +1681,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
tlb->fullmm);
+ arch_check_zapped_pmd(vma, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
@@ -1924,7 +1925,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
/* See change_pte_range(). */
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
can_change_pmd_writable(vma, addr, entry))
- entry = pmd_mkwrite(entry);
+ entry = pmd_mkwrite(entry, vma);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a7ed17cbc84e..870238ccd662 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4904,7 +4904,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
- vma->vm_page_prot)));
+ vma->vm_page_prot)), vma);
} else {
entry = huge_pte_wrprotect(mk_huge_pte(page,
vma->vm_page_prot));
@@ -4920,7 +4920,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
{
pte_t entry;
- entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)));
+ entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep)), vma);
if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
}
diff --git a/mm/internal.h b/mm/internal.h
index 18cda26b8a92..295eb08c6a68 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -535,14 +535,14 @@ static inline bool is_exec_mapping(vm_flags_t flags)
}
/*
- * Stack area - automatically grows in one direction
+ * Stack area (including shadow stacks)
*
* VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
* do_mmap() forbids all other combinations.
*/
static inline bool is_stack_mapping(vm_flags_t flags)
{
- return (flags & VM_STACK) == VM_STACK;
+ return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
}
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 24c5cffe3e6c..b5ffbaf616f5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1390,8 +1390,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
size_t len;
struct blk_plug plug;
- start = untagged_addr(start);
-
if (!madvise_behavior_valid(behavior))
return -EINVAL;
@@ -1423,6 +1421,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
mmap_read_lock(mm);
}
+ start = untagged_addr_remote(mm, start);
+ end = start + len;
+
blk_start_plug(&plug);
error = madvise_walk_vmas(mm, start, end, behavior,
madvise_vma_behavior);
diff --git a/mm/memory.c b/mm/memory.c
index 0a5cefea9774..42dd1ab5e4e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1412,6 +1412,7 @@ again:
continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
+ arch_check_zapped_pte(vma, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
zap_install_uffd_wp_if_needed(vma, addr, pte, details,
ptent);
@@ -4100,7 +4101,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
entry = mk_pte(&folio->page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkwrite(pte_mkdirty(entry), vma);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
@@ -4796,7 +4797,7 @@ out_map:
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (writable)
- pte = pte_mkwrite(pte);
+ pte = pte_mkwrite(pte, vma);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index afe21c48dc6e..0f67f65565b2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2101,15 +2101,18 @@ static int do_move_pages_to_node(struct mm_struct *mm,
* target node
* 1 - when it has been queued
*/
-static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
int node, struct list_head *pagelist, bool migrate_all)
{
struct vm_area_struct *vma;
+ unsigned long addr;
struct page *page;
int err;
bool isolated;
mmap_read_lock(mm);
+ addr = (unsigned long)untagged_addr_remote(mm, p);
+
err = -EFAULT;
vma = vma_lookup(mm, addr);
if (!vma || !vma_migratable(vma))
@@ -2215,7 +2218,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
for (i = start = 0; i < nr_pages; i++) {
const void __user *p;
- unsigned long addr;
int node;
err = -EFAULT;
@@ -2223,7 +2225,6 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;
if (get_user(node, nodes + i))
goto out_flush;
- addr = (unsigned long)untagged_addr(p);
err = -ENODEV;
if (node < 0 || node >= MAX_NUMNODES)
@@ -2251,8 +2252,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
* Errors in the page lookup or isolation are not fatal and we simply
* report them via status
*/
- err = add_page_for_migration(mm, addr, current_node,
- &pagelist, flags & MPOL_MF_MOVE_ALL);
+ err = add_page_for_migration(mm, p, current_node, &pagelist,
+ flags & MPOL_MF_MOVE_ALL);
if (err > 0) {
/* The page is successfully queued for migration */
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index d30c9de60b0d..df3f5e9d5f76 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -646,7 +646,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
+ entry = pte_mkwrite(pte_mkdirty(entry), vma);
}
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
diff --git a/mm/mmap.c b/mm/mmap.c
index 055fbd5ed762..62ecfec42802 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1221,11 +1221,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, unsigned long pgoff,
- unsigned long *populate, struct list_head *uf)
+ unsigned long flags, vm_flags_t vm_flags,
+ unsigned long pgoff, unsigned long *populate,
+ struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- vm_flags_t vm_flags;
int pkey = 0;
validate_mm(mm);
@@ -1286,7 +1286,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -2860,7 +2860,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
- prot, flags, pgoff, &populate, NULL);
+ prot, flags, 0, pgoff, &populate, NULL);
fput(file);
out:
mmap_write_unlock(mm);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 92d3d3ca390a..afdb6723782e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -198,7 +198,7 @@ static long change_pte_range(struct mmu_gather *tlb,
if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
!pte_write(ptent) &&
can_change_pte_writable(vma, addr, ptent))
- ptent = pte_mkwrite(ptent);
+ ptent = pte_mkwrite(ptent, vma);
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
if (pte_needs_flush(oldpte, ptent))
diff --git a/mm/nommu.c b/mm/nommu.c
index f670d9979a26..138826c4a872 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1002,6 +1002,7 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
+ vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1009,7 +1010,6 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
- vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
VMA_ITERATOR(vmi, current->mm, 0);
@@ -1029,7 +1029,7 @@ unsigned long do_mmap(struct file *file,
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
- vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+ vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping */
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index be423189120d..a675f0a9c2b9 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -76,7 +76,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
if (page_in_cache && !vm_shared)
writable = false;
if (writable)
- _dst_pte = pte_mkwrite(_dst_pte);
+ _dst_pte = pte_mkwrite(_dst_pte, dst_vma);
if (flags & MFILL_ATOMIC_WP)
_dst_pte = pte_mkuffd_wp(_dst_pte);
diff --git a/mm/util.c b/mm/util.c
index dd12b9531ac4..8e7fc6cacab4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -540,7 +540,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+ ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
&uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
diff --git a/scripts/sorttable.h b/scripts/sorttable.h
index deb7c1d3e979..7bd0184380d3 100644
--- a/scripts/sorttable.h
+++ b/scripts/sorttable.h
@@ -128,7 +128,7 @@ static int orc_sort_cmp(const void *_a, const void *_b)
* whitelisted .o files which didn't get objtool generation.
*/
orc_a = g_orc_table + (a - g_orc_ip_table);
- return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
+ return orc_a->type == ORC_TYPE_UNDEFINED ? -1 : 1;
}
static void *sort_orctable(void *arg)
diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h
index 1343a62106de..46d7e06763c9 100644
--- a/tools/arch/x86/include/asm/orc_types.h
+++ b/tools/arch/x86/include/asm/orc_types.h
@@ -39,6 +39,12 @@
#define ORC_REG_SP_INDIRECT 9
#define ORC_REG_MAX 15
+#define ORC_TYPE_UNDEFINED 0
+#define ORC_TYPE_END_OF_STACK 1
+#define ORC_TYPE_CALL 2
+#define ORC_TYPE_REGS 3
+#define ORC_TYPE_REGS_PARTIAL 4
+
#ifndef __ASSEMBLY__
#include <asm/byteorder.h>
@@ -56,16 +62,14 @@ struct orc_entry {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned sp_reg:4;
unsigned bp_reg:4;
- unsigned type:2;
+ unsigned type:3;
unsigned signal:1;
- unsigned end:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned bp_reg:4;
unsigned sp_reg:4;
unsigned unused:4;
- unsigned end:1;
unsigned signal:1;
- unsigned type:2;
+ unsigned type:3;
#endif
} __packed;
diff --git a/tools/arch/x86/kcpuid/cpuid.csv b/tools/arch/x86/kcpuid/cpuid.csv
index 4f1c4b0c29e9..e0c25b75327e 100644
--- a/tools/arch/x86/kcpuid/cpuid.csv
+++ b/tools/arch/x86/kcpuid/cpuid.csv
@@ -184,8 +184,8 @@
7, 0, EBX, 27, avx512er, AVX512 Exponent Reciproca instr
7, 0, EBX, 28, avx512cd, AVX512 Conflict Detection instr
7, 0, EBX, 29, sha, Intel Secure Hash Algorithm Extensions instr
- 7, 0, EBX, 26, avx512bw, AVX512 Byte & Word instr
- 7, 0, EBX, 28, avx512vl, AVX512 Vector Length Extentions (VL)
+ 7, 0, EBX, 30, avx512bw, AVX512 Byte & Word instr
+ 7, 0, EBX, 31, avx512vl, AVX512 Vector Length Extentions (VL)
7, 0, ECX, 0, prefetchwt1, X
7, 0, ECX, 1, avx512vbmi, AVX512 Vector Byte Manipulation Instructions
7, 0, ECX, 2, umip, User-mode Instruction Prevention
@@ -340,19 +340,70 @@
# According to SDM
# 40000000H - 4FFFFFFFH is invalid range
-
# Leaf 80000001H
# Extended Processor Signature and Feature Bits
+0x80000001, 0, EAX, 27:20, extfamily, Extended family
+0x80000001, 0, EAX, 19:16, extmodel, Extended model
+0x80000001, 0, EAX, 11:8, basefamily, Description of Family
+0x80000001, 0, EAX, 11:8, basemodel, Model numbers vary with product
+0x80000001, 0, EAX, 3:0, stepping, Processor stepping (revision) for a specific model
+
+0x80000001, 0, EBX, 31:28, pkgtype, Specifies the package type
+
0x80000001, 0, ECX, 0, lahf_lm, LAHF/SAHF available in 64-bit mode
+0x80000001, 0, ECX, 1, cmplegacy, Core multi-processing legacy mode
+0x80000001, 0, ECX, 2, svm, Indicates support for: VMRUN, VMLOAD, VMSAVE, CLGI, VMMCALL, and INVLPGA
+0x80000001, 0, ECX, 3, extapicspace, Extended APIC register space
+0x80000001, 0, ECX, 4, altmovecr8, Indicates support for LOCK MOV CR0 means MOV CR8
0x80000001, 0, ECX, 5, lzcnt, LZCNT
+0x80000001, 0, ECX, 6, sse4a, EXTRQ, INSERTQ, MOVNTSS, and MOVNTSD instruction support
+0x80000001, 0, ECX, 7, misalignsse, Misaligned SSE Mode
0x80000001, 0, ECX, 8, prefetchw, PREFETCHW
-
+0x80000001, 0, ECX, 9, osvw, OS Visible Work-around support
+0x80000001, 0, ECX, 10, ibs, Instruction Based Sampling
+0x80000001, 0, ECX, 11, xop, Extended operation support
+0x80000001, 0, ECX, 12, skinit, SKINIT and STGI support
+0x80000001, 0, ECX, 13, wdt, Watchdog timer support
+0x80000001, 0, ECX, 15, lwp, Lightweight profiling support
+0x80000001, 0, ECX, 16, fma4, Four-operand FMA instruction support
+0x80000001, 0, ECX, 17, tce, Translation cache extension
+0x80000001, 0, ECX, 22, TopologyExtensions, Indicates support for Core::X86::Cpuid::CachePropEax0 and Core::X86::Cpuid::ExtApicId
+0x80000001, 0, ECX, 23, perfctrextcore, Indicates support for Core::X86::Msr::PERF_CTL0 - 5 and Core::X86::Msr::PERF_CTR
+0x80000001, 0, ECX, 24, perfctrextdf, Indicates support for Core::X86::Msr::DF_PERF_CTL and Core::X86::Msr::DF_PERF_CTR
+0x80000001, 0, ECX, 26, databreakpointextension, Indicates data breakpoint support for Core::X86::Msr::DR0_ADDR_MASK, Core::X86::Msr::DR1_ADDR_MASK, Core::X86::Msr::DR2_ADDR_MASK and Core::X86::Msr::DR3_ADDR_MASK
+0x80000001, 0, ECX, 27, perftsc, Performance time-stamp counter supported
+0x80000001, 0, ECX, 28, perfctrextllc, Indicates support for L3 performance counter extensions
+0x80000001, 0, ECX, 29, mwaitextended, MWAITX and MONITORX capability is supported
+0x80000001, 0, ECX, 30, admskextn, Indicates support for address mask extension (to 32 bits and to all 4 DRs) for instruction breakpoints
+
+0x80000001, 0, EDX, 0, fpu, x87 floating point unit on-chip
+0x80000001, 0, EDX, 1, vme, Virtual-mode enhancements
+0x80000001, 0, EDX, 2, de, Debugging extensions, IO breakpoints, CR4.DE
+0x80000001, 0, EDX, 3, pse, Page-size extensions (4 MB pages)
+0x80000001, 0, EDX, 4, tsc, Time stamp counter, RDTSC/RDTSCP instructions, CR4.TSD
+0x80000001, 0, EDX, 5, msr, Model-specific registers (MSRs), with RDMSR and WRMSR instructions
+0x80000001, 0, EDX, 6, pae, Physical-address extensions (PAE)
+0x80000001, 0, EDX, 7, mce, Machine Check Exception, CR4.MCE
+0x80000001, 0, EDX, 8, cmpxchg8b, CMPXCHG8B instruction
+0x80000001, 0, EDX, 9, apic, advanced programmable interrupt controller (APIC) exists and is enabled
0x80000001, 0, EDX, 11, sysret, SYSCALL/SYSRET supported
+0x80000001, 0, EDX, 12, mtrr, Memory-type range registers
+0x80000001, 0, EDX, 13, pge, Page global extension, CR4.PGE
+0x80000001, 0, EDX, 14, mca, Machine check architecture, MCG_CAP
+0x80000001, 0, EDX, 15, cmov, Conditional move instructions, CMOV, FCOMI, FCMOV
+0x80000001, 0, EDX, 16, pat, Page attribute table
+0x80000001, 0, EDX, 17, pse36, Page-size extensions
0x80000001, 0, EDX, 20, exec_dis, Execute Disable Bit available
+0x80000001, 0, EDX, 22, mmxext, AMD extensions to MMX instructions
+0x80000001, 0, EDX, 23, mmx, MMX instructions
+0x80000001, 0, EDX, 24, fxsr, FXSAVE and FXRSTOR instructions
+0x80000001, 0, EDX, 25, ffxsr, FXSAVE and FXRSTOR instruction optimizations
0x80000001, 0, EDX, 26, 1gb_page, 1GB page supported
0x80000001, 0, EDX, 27, rdtscp, RDTSCP and IA32_TSC_AUX are available
-#0x80000001, 0, EDX, 29, 64b, 64b Architecture supported
+0x80000001, 0, EDX, 29, lm, 64b Architecture supported
+0x80000001, 0, EDX, 30, threednowext, AMD extensions to 3DNow! instructions
+0x80000001, 0, EDX, 31, threednow, 3DNow! instructions
# Leaf 80000002H/80000003H/80000004H
# Processor Brand String
diff --git a/tools/arch/x86/kcpuid/kcpuid.c b/tools/arch/x86/kcpuid/kcpuid.c
index dae75511fef7..416f5b35dd8f 100644
--- a/tools/arch/x86/kcpuid/kcpuid.c
+++ b/tools/arch/x86/kcpuid/kcpuid.c
@@ -33,7 +33,7 @@ struct reg_desc {
struct bits_desc descs[32];
};
-enum {
+enum cpuid_reg {
R_EAX = 0,
R_EBX,
R_ECX,
@@ -41,6 +41,10 @@ enum {
NR_REGS
};
+static const char * const reg_names[] = {
+ "EAX", "EBX", "ECX", "EDX",
+};
+
struct subleaf {
u32 index;
u32 sub;
@@ -428,12 +432,18 @@ static void parse_text(void)
/* Decode every eax/ebx/ecx/edx */
-static void decode_bits(u32 value, struct reg_desc *rdesc)
+static void decode_bits(u32 value, struct reg_desc *rdesc, enum cpuid_reg reg)
{
struct bits_desc *bdesc;
int start, end, i;
u32 mask;
+ if (!rdesc->nr) {
+ if (show_details)
+ printf("\t %s: 0x%08x\n", reg_names[reg], value);
+ return;
+ }
+
for (i = 0; i < rdesc->nr; i++) {
bdesc = &rdesc->descs[i];
@@ -468,13 +478,21 @@ static void show_leaf(struct subleaf *leaf)
if (!leaf)
return;
- if (show_raw)
+ if (show_raw) {
leaf_print_raw(leaf);
+ } else {
+ if (show_details)
+ printf("CPUID_0x%x_ECX[0x%x]:\n",
+ leaf->index, leaf->sub);
+ }
+
+ decode_bits(leaf->eax, &leaf->info[R_EAX], R_EAX);
+ decode_bits(leaf->ebx, &leaf->info[R_EBX], R_EBX);
+ decode_bits(leaf->ecx, &leaf->info[R_ECX], R_ECX);
+ decode_bits(leaf->edx, &leaf->info[R_EDX], R_EDX);
- decode_bits(leaf->eax, &leaf->info[R_EAX]);
- decode_bits(leaf->ebx, &leaf->info[R_EBX]);
- decode_bits(leaf->ecx, &leaf->info[R_ECX]);
- decode_bits(leaf->edx, &leaf->info[R_EDX]);
+ if (!show_raw && show_details)
+ printf("\n");
}
static void show_func(struct cpuid_func *func)
diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h
deleted file mode 100644
index 9ac3df3fccf0..000000000000
--- a/tools/include/linux/objtool.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_OBJTOOL_H
-#define _LINUX_OBJTOOL_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-/*
- * This struct is used by asm and inline asm code to manually annotate the
- * location of registers on the stack.
- */
-struct unwind_hint {
- u32 ip;
- s16 sp_offset;
- u8 sp_reg;
- u8 type;
- u8 signal;
- u8 end;
-};
-#endif
-
-/*
- * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
- * (the caller's SP right before it made the call). Used for all callable
- * functions, i.e. all C code and all callable asm functions.
- *
- * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
- * points to a fully populated pt_regs from a syscall, interrupt, or exception.
- *
- * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
- * sp_reg+sp_offset points to the iret return frame.
- *
- * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function.
- * Useful for code which doesn't have an ELF function annotation.
- *
- * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc.
- */
-#define UNWIND_HINT_TYPE_CALL 0
-#define UNWIND_HINT_TYPE_REGS 1
-#define UNWIND_HINT_TYPE_REGS_PARTIAL 2
-#define UNWIND_HINT_TYPE_FUNC 3
-#define UNWIND_HINT_TYPE_ENTRY 4
-#define UNWIND_HINT_TYPE_SAVE 5
-#define UNWIND_HINT_TYPE_RESTORE 6
-
-#ifdef CONFIG_OBJTOOL
-
-#include <asm/asm.h>
-
-#ifndef __ASSEMBLY__
-
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
- "987: \n\t" \
- ".pushsection .discard.unwind_hints\n\t" \
- /* struct unwind_hint */ \
- ".long 987b - .\n\t" \
- ".short " __stringify(sp_offset) "\n\t" \
- ".byte " __stringify(sp_reg) "\n\t" \
- ".byte " __stringify(type) "\n\t" \
- ".byte " __stringify(signal) "\n\t" \
- ".byte " __stringify(end) "\n\t" \
- ".balign 4 \n\t" \
- ".popsection\n\t"
-
-/*
- * This macro marks the given function's stack frame as "non-standard", which
- * tells objtool to ignore the function when doing stack metadata validation.
- * It should only be used in special cases where you're 100% sure it won't
- * affect the reliability of frame pointers and kernel stack traces.
- *
- * For more information, see tools/objtool/Documentation/objtool.txt.
- */
-#define STACK_FRAME_NON_STANDARD(func) \
- static void __used __section(".discard.func_stack_frame_non_standard") \
- *__func_stack_frame_non_standard_##func = func
-
-/*
- * STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore
- * for the case where a function is intentionally missing frame pointer setup,
- * but otherwise needs objtool/ORC coverage when frame pointers are disabled.
- */
-#ifdef CONFIG_FRAME_POINTER
-#define STACK_FRAME_NON_STANDARD_FP(func) STACK_FRAME_NON_STANDARD(func)
-#else
-#define STACK_FRAME_NON_STANDARD_FP(func)
-#endif
-
-#define ANNOTATE_NOENDBR \
- "986: \n\t" \
- ".pushsection .discard.noendbr\n\t" \
- _ASM_PTR " 986b\n\t" \
- ".popsection\n\t"
-
-#define ASM_REACHABLE \
- "998:\n\t" \
- ".pushsection .discard.reachable\n\t" \
- ".long 998b - .\n\t" \
- ".popsection\n\t"
-
-#else /* __ASSEMBLY__ */
-
-/*
- * This macro indicates that the following intra-function call is valid.
- * Any non-annotated intra-function call will cause objtool to issue a warning.
- */
-#define ANNOTATE_INTRA_FUNCTION_CALL \
- 999: \
- .pushsection .discard.intra_function_calls; \
- .long 999b; \
- .popsection;
-
-/*
- * In asm, there are two kinds of code: normal C-type callable functions and
- * the rest. The normal callable functions can be called by other code, and
- * don't do anything unusual with the stack. Such normal callable functions
- * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this
- * category. In this case, no special debugging annotations are needed because
- * objtool can automatically generate the ORC data for the ORC unwinder to read
- * at runtime.
- *
- * Anything which doesn't fall into the above category, such as syscall and
- * interrupt handlers, tends to not be called directly by other functions, and
- * often does unusual non-C-function-type things with the stack pointer. Such
- * code needs to be annotated such that objtool can understand it. The
- * following CFI hint macros are for this type of code.
- *
- * These macros provide hints to objtool about the state of the stack at each
- * instruction. Objtool starts from the hints and follows the code flow,
- * making automatic CFI adjustments when it sees pushes and pops, filling out
- * the debuginfo as necessary. It will also warn if it sees any
- * inconsistencies.
- */
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
-.Lunwind_hint_ip_\@:
- .pushsection .discard.unwind_hints
- /* struct unwind_hint */
- .long .Lunwind_hint_ip_\@ - .
- .short \sp_offset
- .byte \sp_reg
- .byte \type
- .byte \signal
- .byte \end
- .balign 4
- .popsection
-.endm
-
-.macro STACK_FRAME_NON_STANDARD func:req
- .pushsection .discard.func_stack_frame_non_standard, "aw"
- _ASM_PTR \func
- .popsection
-.endm
-
-.macro STACK_FRAME_NON_STANDARD_FP func:req
-#ifdef CONFIG_FRAME_POINTER
- STACK_FRAME_NON_STANDARD \func
-#endif
-.endm
-
-.macro ANNOTATE_NOENDBR
-.Lhere_\@:
- .pushsection .discard.noendbr
- .quad .Lhere_\@
- .popsection
-.endm
-
-.macro REACHABLE
-.Lhere_\@:
- .pushsection .discard.reachable
- .long .Lhere_\@ - .
- .popsection
-.endm
-
-#endif /* __ASSEMBLY__ */
-
-#else /* !CONFIG_OBJTOOL */
-
-#ifndef __ASSEMBLY__
-
-#define UNWIND_HINT(sp_reg, sp_offset, type, signal, end) \
- "\n\t"
-#define STACK_FRAME_NON_STANDARD(func)
-#define STACK_FRAME_NON_STANDARD_FP(func)
-#define ANNOTATE_NOENDBR
-#define ASM_REACHABLE
-#else
-#define ANNOTATE_INTRA_FUNCTION_CALL
-.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 end=0
-.endm
-.macro STACK_FRAME_NON_STANDARD func:req
-.endm
-.macro ANNOTATE_NOENDBR
-.endm
-.macro REACHABLE
-.endm
-#endif
-
-#endif /* CONFIG_OBJTOOL */
-
-#endif /* _LINUX_OBJTOOL_H */
diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h
new file mode 100644
index 000000000000..453a4f4ef39d
--- /dev/null
+++ b/tools/include/linux/objtool_types.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_OBJTOOL_TYPES_H
+#define _LINUX_OBJTOOL_TYPES_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/*
+ * This struct is used by asm and inline asm code to manually annotate the
+ * location of registers on the stack.
+ */
+struct unwind_hint {
+ u32 ip;
+ s16 sp_offset;
+ u8 sp_reg;
+ u8 type;
+ u8 signal;
+};
+
+#endif /* __ASSEMBLY__ */
+
+/*
+ * UNWIND_HINT_TYPE_UNDEFINED: A blind spot in ORC coverage which can result in
+ * a truncated and unreliable stack unwind.
+ *
+ * UNWIND_HINT_TYPE_END_OF_STACK: The end of the kernel stack unwind before
+ * hitting user entry, boot code, or fork entry (when there are no pt_regs
+ * available).
+ *
+ * UNWIND_HINT_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP
+ * (the caller's SP right before it made the call). Used for all callable
+ * functions, i.e. all C code and all callable asm functions.
+ *
+ * UNWIND_HINT_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset
+ * points to a fully populated pt_regs from a syscall, interrupt, or exception.
+ *
+ * UNWIND_HINT_TYPE_REGS_PARTIAL: Used in entry code to indicate that
+ * sp_reg+sp_offset points to the iret return frame.
+ *
+ * UNWIND_HINT_TYPE_FUNC: Generate the unwind metadata of a callable function.
+ * Useful for code which doesn't have an ELF function annotation.
+ *
+ * UNWIND_HINT_TYPE_{SAVE,RESTORE}: Save the unwind metadata at a certain
+ * location so that it can be restored later.
+ */
+#define UNWIND_HINT_TYPE_UNDEFINED 0
+#define UNWIND_HINT_TYPE_END_OF_STACK 1
+#define UNWIND_HINT_TYPE_CALL 2
+#define UNWIND_HINT_TYPE_REGS 3
+#define UNWIND_HINT_TYPE_REGS_PARTIAL 4
+/* The below hint types don't have corresponding ORC types */
+#define UNWIND_HINT_TYPE_FUNC 5
+#define UNWIND_HINT_TYPE_SAVE 6
+#define UNWIND_HINT_TYPE_RESTORE 7
+
+#endif /* _LINUX_OBJTOOL_TYPES_H */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index f937be1afe65..cae6ac6ff246 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -17,7 +17,7 @@
#include <objtool/warn.h>
#include <objtool/endianness.h>
-#include <linux/objtool.h>
+#include <linux/objtool_types.h>
#include <linux/hashtable.h>
#include <linux/kernel.h>
#include <linux/static_call_types.h>
@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
"__reiserfs_panic",
"__stack_chk_fail",
"__ubsan_handle_builtin_unreachable",
+ "arch_cpu_idle_dead",
"cpu_bringup_and_idle",
"cpu_startup_entry",
"do_exit",
@@ -2242,6 +2243,7 @@ static void set_func_state(struct cfi_state *state)
memcpy(&state->regs, &initial_func_cfi.regs,
CFI_NUM_REGS * sizeof(struct cfi_reg));
state->stack_size = initial_func_cfi.cfa.offset;
+ state->type = UNWIND_HINT_TYPE_CALL;
}
static int read_unwind_hints(struct objtool_file *file)
@@ -2306,16 +2308,9 @@ static int read_unwind_hints(struct objtool_file *file)
WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR",
insn->sec, insn->offset);
}
-
- insn->entry = 1;
}
}
- if (hint->type == UNWIND_HINT_TYPE_ENTRY) {
- hint->type = UNWIND_HINT_TYPE_CALL;
- insn->entry = 1;
- }
-
if (hint->type == UNWIND_HINT_TYPE_FUNC) {
insn->cfi = &func_cfi;
continue;
@@ -2333,7 +2328,6 @@ static int read_unwind_hints(struct objtool_file *file)
cfi.cfa.offset = bswap_if_needed(file->elf, hint->sp_offset);
cfi.type = hint->type;
cfi.signal = hint->signal;
- cfi.end = hint->end;
insn->cfi = cfi_hash_find_or_add(&cfi);
}
@@ -2448,6 +2442,34 @@ static int read_instr_hints(struct objtool_file *file)
return 0;
}
+static int read_validate_unret_hints(struct objtool_file *file)
+{
+ struct section *sec;
+ struct instruction *insn;
+ struct reloc *reloc;
+
+ sec = find_section_by_name(file->elf, ".rela.discard.validate_unret");
+ if (!sec)
+ return 0;
+
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
+ WARN("unexpected relocation symbol type in %s", sec->name);
+ return -1;
+ }
+
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
+ if (!insn) {
+ WARN("bad .discard.instr_end entry");
+ return -1;
+ }
+ insn->unret = 1;
+ }
+
+ return 0;
+}
+
+
static int read_intra_function_calls(struct objtool_file *file)
{
struct instruction *insn;
@@ -2666,6 +2688,10 @@ static int decode_sections(struct objtool_file *file)
if (ret)
return ret;
+ ret = read_validate_unret_hints(file);
+ if (ret)
+ return ret;
+
return 0;
}
@@ -3862,10 +3888,10 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec)
/*
* Validate rethunk entry constraint: must untrain RET before the first RET.
*
- * Follow every branch (intra-function) and ensure ANNOTATE_UNRET_END comes
+ * Follow every branch (intra-function) and ensure VALIDATE_UNRET_END comes
* before an actual RET instruction.
*/
-static int validate_entry(struct objtool_file *file, struct instruction *insn)
+static int validate_unret(struct objtool_file *file, struct instruction *insn)
{
struct instruction *next, *dest;
int ret, warnings = 0;
@@ -3873,10 +3899,10 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
for (;;) {
next = next_insn_to_validate(file, insn);
- if (insn->visited & VISITED_ENTRY)
+ if (insn->visited & VISITED_UNRET)
return 0;
- insn->visited |= VISITED_ENTRY;
+ insn->visited |= VISITED_UNRET;
if (!insn->ignore_alts && insn->alts) {
struct alternative *alt;
@@ -3886,7 +3912,7 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
if (alt->skip_orig)
skip_orig = true;
- ret = validate_entry(file, alt->insn);
+ ret = validate_unret(file, alt->insn);
if (ret) {
if (opts.backtrace)
BT_FUNC("(alt)", insn);
@@ -3914,7 +3940,7 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
insn->sec, insn->offset);
return -1;
}
- ret = validate_entry(file, insn->jump_dest);
+ ret = validate_unret(file, insn->jump_dest);
if (ret) {
if (opts.backtrace) {
BT_FUNC("(branch%s)", insn,
@@ -3939,7 +3965,7 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
return -1;
}
- ret = validate_entry(file, dest);
+ ret = validate_unret(file, dest);
if (ret) {
if (opts.backtrace)
BT_FUNC("(call)", insn);
@@ -3975,19 +4001,19 @@ static int validate_entry(struct objtool_file *file, struct instruction *insn)
}
/*
- * Validate that all branches starting at 'insn->entry' encounter UNRET_END
- * before RET.
+ * Validate that all branches starting at VALIDATE_UNRET_BEGIN encounter
+ * VALIDATE_UNRET_END before RET.
*/
-static int validate_unret(struct objtool_file *file)
+static int validate_unrets(struct objtool_file *file)
{
struct instruction *insn;
int ret, warnings = 0;
for_each_insn(file, insn) {
- if (!insn->entry)
+ if (!insn->unret)
continue;
- ret = validate_entry(file, insn);
+ ret = validate_unret(file, insn);
if (ret < 0) {
WARN_FUNC("Failed UNRET validation", insn->sec, insn->offset);
return ret;
@@ -4606,7 +4632,7 @@ int check(struct objtool_file *file)
* Must be after validate_branch() and friends, it plays
* further games with insn->visited.
*/
- ret = validate_unret(file);
+ ret = validate_unrets(file);
if (ret < 0)
return ret;
warnings += ret;
diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h
index 3e7c7004f7df..daa46f1f0965 100644
--- a/tools/objtool/include/objtool/check.h
+++ b/tools/objtool/include/objtool/check.h
@@ -61,7 +61,7 @@ struct instruction {
restore : 1,
retpoline_safe : 1,
noendbr : 1,
- entry : 1,
+ unret : 1,
visited : 4,
no_reloc : 1;
/* 10 bit hole */
@@ -92,7 +92,7 @@ static inline struct symbol *insn_func(struct instruction *insn)
#define VISITED_BRANCH 0x01
#define VISITED_BRANCH_UACCESS 0x02
#define VISITED_BRANCH_MASK 0x03
-#define VISITED_ENTRY 0x04
+#define VISITED_UNRET 0x04
static inline bool is_static_jump(struct instruction *insn)
{
diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c
index 2d8ebdcd1db3..0e183bb1c720 100644
--- a/tools/objtool/orc_dump.c
+++ b/tools/objtool/orc_dump.c
@@ -4,7 +4,6 @@
*/
#include <unistd.h>
-#include <linux/objtool.h>
#include <asm/orc_types.h>
#include <objtool/objtool.h>
#include <objtool/warn.h>
@@ -39,11 +38,15 @@ static const char *reg_name(unsigned int reg)
static const char *orc_type_name(unsigned int type)
{
switch (type) {
- case UNWIND_HINT_TYPE_CALL:
+ case ORC_TYPE_UNDEFINED:
+ return "(und)";
+ case ORC_TYPE_END_OF_STACK:
+ return "end";
+ case ORC_TYPE_CALL:
return "call";
- case UNWIND_HINT_TYPE_REGS:
+ case ORC_TYPE_REGS:
return "regs";
- case UNWIND_HINT_TYPE_REGS_PARTIAL:
+ case ORC_TYPE_REGS_PARTIAL:
return "regs (partial)";
default:
return "?";
@@ -202,6 +205,7 @@ int orc_dump(const char *_objname)
printf("%llx:", (unsigned long long)(orc_ip_addr + (i * sizeof(int)) + orc_ip[i]));
}
+ printf("type:%s", orc_type_name(orc[i].type));
printf(" sp:");
@@ -211,8 +215,7 @@ int orc_dump(const char *_objname)
print_reg(orc[i].bp_reg, bswap_if_needed(&dummy_elf, orc[i].bp_offset));
- printf(" type:%s signal:%d end:%d\n",
- orc_type_name(orc[i].type), orc[i].signal, orc[i].end);
+ printf(" signal:%d\n", orc[i].signal);
}
elf_end(elf);
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index 57a4527d5988..b327f9ccfe73 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -6,7 +6,7 @@
#include <stdlib.h>
#include <string.h>
-#include <linux/objtool.h>
+#include <linux/objtool_types.h>
#include <asm/orc_types.h>
#include <objtool/check.h>
@@ -21,19 +21,39 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
memset(orc, 0, sizeof(*orc));
if (!cfi) {
- orc->end = 0;
- orc->sp_reg = ORC_REG_UNDEFINED;
+ /*
+ * This is usually either unreachable nops/traps (which don't
+ * trigger unreachable instruction warnings), or
+ * STACK_FRAME_NON_STANDARD functions.
+ */
+ orc->type = ORC_TYPE_UNDEFINED;
return 0;
}
- orc->end = cfi->end;
- orc->signal = cfi->signal;
-
- if (cfi->cfa.base == CFI_UNDEFINED) {
- orc->sp_reg = ORC_REG_UNDEFINED;
+ switch (cfi->type) {
+ case UNWIND_HINT_TYPE_UNDEFINED:
+ orc->type = ORC_TYPE_UNDEFINED;
+ return 0;
+ case UNWIND_HINT_TYPE_END_OF_STACK:
+ orc->type = ORC_TYPE_END_OF_STACK;
return 0;
+ case UNWIND_HINT_TYPE_CALL:
+ orc->type = ORC_TYPE_CALL;
+ break;
+ case UNWIND_HINT_TYPE_REGS:
+ orc->type = ORC_TYPE_REGS;
+ break;
+ case UNWIND_HINT_TYPE_REGS_PARTIAL:
+ orc->type = ORC_TYPE_REGS_PARTIAL;
+ break;
+ default:
+ WARN_FUNC("unknown unwind hint type %d",
+ insn->sec, insn->offset, cfi->type);
+ return -1;
}
+ orc->signal = cfi->signal;
+
switch (cfi->cfa.base) {
case CFI_SP:
orc->sp_reg = ORC_REG_SP;
@@ -83,7 +103,6 @@ static int init_orc_entry(struct orc_entry *orc, struct cfi_state *cfi,
orc->sp_offset = cfi->cfa.offset;
orc->bp_offset = bp->offset;
- orc->type = cfi->type;
return 0;
}
@@ -148,11 +167,7 @@ int orc_create(struct objtool_file *file)
struct orc_list_entry *entry;
struct list_head orc_list;
- struct orc_entry null = {
- .sp_reg = ORC_REG_UNDEFINED,
- .bp_reg = ORC_REG_UNDEFINED,
- .type = UNWIND_HINT_TYPE_CALL,
- };
+ struct orc_entry null = { .type = ORC_TYPE_UNDEFINED };
/* Build a deduplicated list of ORC entries: */
INIT_LIST_HEAD(&orc_list);
diff --git a/tools/objtool/sync-check.sh b/tools/objtool/sync-check.sh
index 105a291ff8e7..81d120d05442 100755
--- a/tools/objtool/sync-check.sh
+++ b/tools/objtool/sync-check.sh
@@ -6,7 +6,7 @@ if [ -z "$SRCARCH" ]; then
exit 1
fi
-FILES="include/linux/objtool.h"
+FILES="include/linux/objtool_types.h"
if [ "$SRCARCH" = "x86" ]; then
FILES="$FILES
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index ca9374b56ead..7e8c937627dd 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -18,7 +18,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \
- corrupt_xstate_header amx
+ corrupt_xstate_header amx lam test_shadow_stack
# Some selftests require 32bit support enabled also on 64bit systems
TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c
new file mode 100644
index 000000000000..eb0e46905bf9
--- /dev/null
+++ b/tools/testing/selftests/x86/lam.c
@@ -0,0 +1,1241 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <sys/mman.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sched.h>
+
+#include <sys/uio.h>
+#include <linux/io_uring.h>
+#include "../kselftest.h"
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+/* LAM modes, these definitions were copied from kernel code */
+#define LAM_NONE 0
+#define LAM_U57_BITS 6
+
+#define LAM_U57_MASK (0x3fULL << 57)
+/* arch prctl for LAM */
+#define ARCH_GET_UNTAG_MASK 0x4001
+#define ARCH_ENABLE_TAGGED_ADDR 0x4002
+#define ARCH_GET_MAX_TAG_BITS 0x4003
+#define ARCH_FORCE_TAGGED_SVA 0x4004
+
+/* Specified test function bits */
+#define FUNC_MALLOC 0x1
+#define FUNC_BITS 0x2
+#define FUNC_MMAP 0x4
+#define FUNC_SYSCALL 0x8
+#define FUNC_URING 0x10
+#define FUNC_INHERITE 0x20
+#define FUNC_PASID 0x40
+
+#define TEST_MASK 0x7f
+
+#define LOW_ADDR (0x1UL << 30)
+#define HIGH_ADDR (0x3UL << 48)
+
+#define MALLOC_LEN 32
+
+#define PAGE_SIZE (4 << 10)
+
+#define STACK_SIZE 65536
+
+#define barrier() ({ \
+ __asm__ __volatile__("" : : : "memory"); \
+})
+
+#define URING_QUEUE_SZ 1
+#define URING_BLOCK_SZ 2048
+
+/* Pasid test define */
+#define LAM_CMD_BIT 0x1
+#define PAS_CMD_BIT 0x2
+#define SVA_CMD_BIT 0x4
+
+#define PAS_CMD(cmd1, cmd2, cmd3) (((cmd3) << 8) | ((cmd2) << 4) | ((cmd1) << 0))
+
+struct testcases {
+ unsigned int later;
+ int expected; /* 2: SIGSEGV Error; 1: other errors */
+ unsigned long lam;
+ uint64_t addr;
+ uint64_t cmd;
+ int (*test_func)(struct testcases *test);
+ const char *msg;
+};
+
+/* Used by CQ of uring, source file handler and file's size */
+struct file_io {
+ int file_fd;
+ off_t file_sz;
+ struct iovec iovecs[];
+};
+
+struct io_uring_queue {
+ unsigned int *head;
+ unsigned int *tail;
+ unsigned int *ring_mask;
+ unsigned int *ring_entries;
+ unsigned int *flags;
+ unsigned int *array;
+ union {
+ struct io_uring_cqe *cqes;
+ struct io_uring_sqe *sqes;
+ } queue;
+ size_t ring_sz;
+};
+
+struct io_ring {
+ int ring_fd;
+ struct io_uring_queue sq_ring;
+ struct io_uring_queue cq_ring;
+};
+
+int tests_cnt;
+jmp_buf segv_env;
+
+static void segv_handler(int sig)
+{
+ ksft_print_msg("Get segmentation fault(%d).", sig);
+
+ siglongjmp(segv_env, 1);
+}
+
+static inline int cpu_has_lam(void)
+{
+ unsigned int cpuinfo[4];
+
+ __cpuid_count(0x7, 1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+
+ return (cpuinfo[0] & (1 << 26));
+}
+
+/* Check 5-level page table feature in CPUID.(EAX=07H, ECX=00H):ECX.[bit 16] */
+static inline int cpu_has_la57(void)
+{
+ unsigned int cpuinfo[4];
+
+ __cpuid_count(0x7, 0, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+
+ return (cpuinfo[2] & (1 << 16));
+}
+
+/*
+ * Set tagged address and read back untag mask.
+ * check if the untagged mask is expected.
+ *
+ * @return:
+ * 0: Set LAM mode successfully
+ * others: failed to set LAM
+ */
+static int set_lam(unsigned long lam)
+{
+ int ret = 0;
+ uint64_t ptr = 0;
+
+ if (lam != LAM_U57_BITS && lam != LAM_NONE)
+ return -1;
+
+ /* Skip check return */
+ syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, lam);
+
+ /* Get untagged mask */
+ syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &ptr);
+
+ /* Check mask returned is expected */
+ if (lam == LAM_U57_BITS)
+ ret = (ptr != ~(LAM_U57_MASK));
+ else if (lam == LAM_NONE)
+ ret = (ptr != -1ULL);
+
+ return ret;
+}
+
+static unsigned long get_default_tag_bits(void)
+{
+ pid_t pid;
+ int lam = LAM_NONE;
+ int ret = 0;
+
+ pid = fork();
+ if (pid < 0) {
+ perror("Fork failed.");
+ } else if (pid == 0) {
+ /* Set LAM mode in child process */
+ if (set_lam(LAM_U57_BITS) == 0)
+ lam = LAM_U57_BITS;
+ else
+ lam = LAM_NONE;
+ exit(lam);
+ } else {
+ wait(&ret);
+ lam = WEXITSTATUS(ret);
+ }
+
+ return lam;
+}
+
+/*
+ * Set tagged address and read back untag mask.
+ * check if the untag mask is expected.
+ */
+static int get_lam(void)
+{
+ uint64_t ptr = 0;
+ int ret = -1;
+ /* Get untagged mask */
+ if (syscall(SYS_arch_prctl, ARCH_GET_UNTAG_MASK, &ptr) == -1)
+ return -1;
+
+ /* Check mask returned is expected */
+ if (ptr == ~(LAM_U57_MASK))
+ ret = LAM_U57_BITS;
+ else if (ptr == -1ULL)
+ ret = LAM_NONE;
+
+
+ return ret;
+}
+
+/* According to LAM mode, set metadata in high bits */
+static uint64_t set_metadata(uint64_t src, unsigned long lam)
+{
+ uint64_t metadata;
+
+ srand(time(NULL));
+
+ switch (lam) {
+ case LAM_U57_BITS: /* Set metadata in bits 62:57 */
+ /* Get a random non-zero value as metadata */
+ metadata = (rand() % ((1UL << LAM_U57_BITS) - 1) + 1) << 57;
+ metadata |= (src & ~(LAM_U57_MASK));
+ break;
+ default:
+ metadata = src;
+ break;
+ }
+
+ return metadata;
+}
+
+/*
+ * Set metadata in user pointer, compare new pointer with original pointer.
+ * both pointers should point to the same address.
+ *
+ * @return:
+ * 0: value on the pointer with metadate and value on original are same
+ * 1: not same.
+ */
+static int handle_lam_test(void *src, unsigned int lam)
+{
+ char *ptr;
+
+ strcpy((char *)src, "USER POINTER");
+
+ ptr = (char *)set_metadata((uint64_t)src, lam);
+ if (src == ptr)
+ return 0;
+
+ /* Copy a string into the pointer with metadata */
+ strcpy((char *)ptr, "METADATA POINTER");
+
+ return (!!strcmp((char *)src, (char *)ptr));
+}
+
+
+int handle_max_bits(struct testcases *test)
+{
+ unsigned long exp_bits = get_default_tag_bits();
+ unsigned long bits = 0;
+
+ if (exp_bits != LAM_NONE)
+ exp_bits = LAM_U57_BITS;
+
+ /* Get LAM max tag bits */
+ if (syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits) == -1)
+ return 1;
+
+ return (exp_bits != bits);
+}
+
+/*
+ * Test lam feature through dereference pointer get from malloc.
+ * @return 0: Pass test. 1: Get failure during test 2: Get SIGSEGV
+ */
+static int handle_malloc(struct testcases *test)
+{
+ char *ptr = NULL;
+ int ret = 0;
+
+ if (test->later == 0 && test->lam != 0)
+ if (set_lam(test->lam) == -1)
+ return 1;
+
+ ptr = (char *)malloc(MALLOC_LEN);
+ if (ptr == NULL) {
+ perror("malloc() failure\n");
+ return 1;
+ }
+
+ /* Set signal handler */
+ if (sigsetjmp(segv_env, 1) == 0) {
+ signal(SIGSEGV, segv_handler);
+ ret = handle_lam_test(ptr, test->lam);
+ } else {
+ ret = 2;
+ }
+
+ if (test->later != 0 && test->lam != 0)
+ if (set_lam(test->lam) == -1 && ret == 0)
+ ret = 1;
+
+ free(ptr);
+
+ return ret;
+}
+
+static int handle_mmap(struct testcases *test)
+{
+ void *ptr;
+ unsigned int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED;
+ int ret = 0;
+
+ if (test->later == 0 && test->lam != 0)
+ if (set_lam(test->lam) != 0)
+ return 1;
+
+ ptr = mmap((void *)test->addr, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ flags, -1, 0);
+ if (ptr == MAP_FAILED) {
+ if (test->addr == HIGH_ADDR)
+ if (!cpu_has_la57())
+ return 3; /* unsupport LA57 */
+ return 1;
+ }
+
+ if (test->later != 0 && test->lam != 0)
+ if (set_lam(test->lam) != 0)
+ ret = 1;
+
+ if (ret == 0) {
+ if (sigsetjmp(segv_env, 1) == 0) {
+ signal(SIGSEGV, segv_handler);
+ ret = handle_lam_test(ptr, test->lam);
+ } else {
+ ret = 2;
+ }
+ }
+
+ munmap(ptr, PAGE_SIZE);
+ return ret;
+}
+
+static int handle_syscall(struct testcases *test)
+{
+ struct utsname unme, *pu;
+ int ret = 0;
+
+ if (test->later == 0 && test->lam != 0)
+ if (set_lam(test->lam) != 0)
+ return 1;
+
+ if (sigsetjmp(segv_env, 1) == 0) {
+ signal(SIGSEGV, segv_handler);
+ pu = (struct utsname *)set_metadata((uint64_t)&unme, test->lam);
+ ret = uname(pu);
+ if (ret < 0)
+ ret = 1;
+ } else {
+ ret = 2;
+ }
+
+ if (test->later != 0 && test->lam != 0)
+ if (set_lam(test->lam) != -1 && ret == 0)
+ ret = 1;
+
+ return ret;
+}
+
+int sys_uring_setup(unsigned int entries, struct io_uring_params *p)
+{
+ return (int)syscall(__NR_io_uring_setup, entries, p);
+}
+
+int sys_uring_enter(int fd, unsigned int to, unsigned int min, unsigned int flags)
+{
+ return (int)syscall(__NR_io_uring_enter, fd, to, min, flags, NULL, 0);
+}
+
+/* Init submission queue and completion queue */
+int mmap_io_uring(struct io_uring_params p, struct io_ring *s)
+{
+ struct io_uring_queue *sring = &s->sq_ring;
+ struct io_uring_queue *cring = &s->cq_ring;
+
+ sring->ring_sz = p.sq_off.array + p.sq_entries * sizeof(unsigned int);
+ cring->ring_sz = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
+
+ if (p.features & IORING_FEAT_SINGLE_MMAP) {
+ if (cring->ring_sz > sring->ring_sz)
+ sring->ring_sz = cring->ring_sz;
+
+ cring->ring_sz = sring->ring_sz;
+ }
+
+ void *sq_ptr = mmap(0, sring->ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, s->ring_fd,
+ IORING_OFF_SQ_RING);
+
+ if (sq_ptr == MAP_FAILED) {
+ perror("sub-queue!");
+ return 1;
+ }
+
+ void *cq_ptr = sq_ptr;
+
+ if (!(p.features & IORING_FEAT_SINGLE_MMAP)) {
+ cq_ptr = mmap(0, cring->ring_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, s->ring_fd,
+ IORING_OFF_CQ_RING);
+ if (cq_ptr == MAP_FAILED) {
+ perror("cpl-queue!");
+ munmap(sq_ptr, sring->ring_sz);
+ return 1;
+ }
+ }
+
+ sring->head = sq_ptr + p.sq_off.head;
+ sring->tail = sq_ptr + p.sq_off.tail;
+ sring->ring_mask = sq_ptr + p.sq_off.ring_mask;
+ sring->ring_entries = sq_ptr + p.sq_off.ring_entries;
+ sring->flags = sq_ptr + p.sq_off.flags;
+ sring->array = sq_ptr + p.sq_off.array;
+
+ /* Map a queue as mem map */
+ s->sq_ring.queue.sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
+ s->ring_fd, IORING_OFF_SQES);
+ if (s->sq_ring.queue.sqes == MAP_FAILED) {
+ munmap(sq_ptr, sring->ring_sz);
+ if (sq_ptr != cq_ptr) {
+ ksft_print_msg("failed to mmap uring queue!");
+ munmap(cq_ptr, cring->ring_sz);
+ return 1;
+ }
+ }
+
+ cring->head = cq_ptr + p.cq_off.head;
+ cring->tail = cq_ptr + p.cq_off.tail;
+ cring->ring_mask = cq_ptr + p.cq_off.ring_mask;
+ cring->ring_entries = cq_ptr + p.cq_off.ring_entries;
+ cring->queue.cqes = cq_ptr + p.cq_off.cqes;
+
+ return 0;
+}
+
+/* Init io_uring queues */
+int setup_io_uring(struct io_ring *s)
+{
+ struct io_uring_params para;
+
+ memset(&para, 0, sizeof(para));
+ s->ring_fd = sys_uring_setup(URING_QUEUE_SZ, &para);
+ if (s->ring_fd < 0)
+ return 1;
+
+ return mmap_io_uring(para, s);
+}
+
+/*
+ * Get data from completion queue. the data buffer saved the file data
+ * return 0: success; others: error;
+ */
+int handle_uring_cq(struct io_ring *s)
+{
+ struct file_io *fi = NULL;
+ struct io_uring_queue *cring = &s->cq_ring;
+ struct io_uring_cqe *cqe;
+ unsigned int head;
+ off_t len = 0;
+
+ head = *cring->head;
+
+ do {
+ barrier();
+ if (head == *cring->tail)
+ break;
+ /* Get the entry */
+ cqe = &cring->queue.cqes[head & *s->cq_ring.ring_mask];
+ fi = (struct file_io *)cqe->user_data;
+ if (cqe->res < 0)
+ break;
+
+ int blocks = (int)(fi->file_sz + URING_BLOCK_SZ - 1) / URING_BLOCK_SZ;
+
+ for (int i = 0; i < blocks; i++)
+ len += fi->iovecs[i].iov_len;
+
+ head++;
+ } while (1);
+
+ *cring->head = head;
+ barrier();
+
+ return (len != fi->file_sz);
+}
+
+/*
+ * Submit squeue. specify via IORING_OP_READV.
+ * the buffer need to be set metadata according to LAM mode
+ */
+int handle_uring_sq(struct io_ring *ring, struct file_io *fi, unsigned long lam)
+{
+ int file_fd = fi->file_fd;
+ struct io_uring_queue *sring = &ring->sq_ring;
+ unsigned int index = 0, cur_block = 0, tail = 0, next_tail = 0;
+ struct io_uring_sqe *sqe;
+
+ off_t remain = fi->file_sz;
+ int blocks = (int)(remain + URING_BLOCK_SZ - 1) / URING_BLOCK_SZ;
+
+ while (remain) {
+ off_t bytes = remain;
+ void *buf;
+
+ if (bytes > URING_BLOCK_SZ)
+ bytes = URING_BLOCK_SZ;
+
+ fi->iovecs[cur_block].iov_len = bytes;
+
+ if (posix_memalign(&buf, URING_BLOCK_SZ, URING_BLOCK_SZ))
+ return 1;
+
+ fi->iovecs[cur_block].iov_base = (void *)set_metadata((uint64_t)buf, lam);
+ remain -= bytes;
+ cur_block++;
+ }
+
+ next_tail = *sring->tail;
+ tail = next_tail;
+ next_tail++;
+
+ barrier();
+
+ index = tail & *ring->sq_ring.ring_mask;
+
+ sqe = &ring->sq_ring.queue.sqes[index];
+ sqe->fd = file_fd;
+ sqe->flags = 0;
+ sqe->opcode = IORING_OP_READV;
+ sqe->addr = (unsigned long)fi->iovecs;
+ sqe->len = blocks;
+ sqe->off = 0;
+ sqe->user_data = (uint64_t)fi;
+
+ sring->array[index] = index;
+ tail = next_tail;
+
+ if (*sring->tail != tail) {
+ *sring->tail = tail;
+ barrier();
+ }
+
+ if (sys_uring_enter(ring->ring_fd, 1, 1, IORING_ENTER_GETEVENTS) < 0)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Test LAM in async I/O and io_uring, read current binery through io_uring
+ * Set metadata in pointers to iovecs buffer.
+ */
+int do_uring(unsigned long lam)
+{
+ struct io_ring *ring;
+ struct file_io *fi;
+ struct stat st;
+ int ret = 1;
+ char path[PATH_MAX] = {0};
+
+ /* get current process path */
+ if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
+ return 1;
+
+ int file_fd = open(path, O_RDONLY);
+
+ if (file_fd < 0)
+ return 1;
+
+ if (fstat(file_fd, &st) < 0)
+ return 1;
+
+ off_t file_sz = st.st_size;
+
+ int blocks = (int)(file_sz + URING_BLOCK_SZ - 1) / URING_BLOCK_SZ;
+
+ fi = malloc(sizeof(*fi) + sizeof(struct iovec) * blocks);
+ if (!fi)
+ return 1;
+
+ fi->file_sz = file_sz;
+ fi->file_fd = file_fd;
+
+ ring = malloc(sizeof(*ring));
+ if (!ring)
+ return 1;
+
+ memset(ring, 0, sizeof(struct io_ring));
+
+ if (setup_io_uring(ring))
+ goto out;
+
+ if (handle_uring_sq(ring, fi, lam))
+ goto out;
+
+ ret = handle_uring_cq(ring);
+
+out:
+ free(ring);
+
+ for (int i = 0; i < blocks; i++) {
+ if (fi->iovecs[i].iov_base) {
+ uint64_t addr = ((uint64_t)fi->iovecs[i].iov_base);
+
+ switch (lam) {
+ case LAM_U57_BITS: /* Clear bits 62:57 */
+ addr = (addr & ~(LAM_U57_MASK));
+ break;
+ }
+ free((void *)addr);
+ fi->iovecs[i].iov_base = NULL;
+ }
+ }
+
+ free(fi);
+
+ return ret;
+}
+
+int handle_uring(struct testcases *test)
+{
+ int ret = 0;
+
+ if (test->later == 0 && test->lam != 0)
+ if (set_lam(test->lam) != 0)
+ return 1;
+
+ if (sigsetjmp(segv_env, 1) == 0) {
+ signal(SIGSEGV, segv_handler);
+ ret = do_uring(test->lam);
+ } else {
+ ret = 2;
+ }
+
+ return ret;
+}
+
+static int fork_test(struct testcases *test)
+{
+ int ret, child_ret;
+ pid_t pid;
+
+ pid = fork();
+ if (pid < 0) {
+ perror("Fork failed.");
+ ret = 1;
+ } else if (pid == 0) {
+ ret = test->test_func(test);
+ exit(ret);
+ } else {
+ wait(&child_ret);
+ ret = WEXITSTATUS(child_ret);
+ }
+
+ return ret;
+}
+
+static int handle_execve(struct testcases *test)
+{
+ int ret, child_ret;
+ int lam = test->lam;
+ pid_t pid;
+
+ pid = fork();
+ if (pid < 0) {
+ perror("Fork failed.");
+ ret = 1;
+ } else if (pid == 0) {
+ char path[PATH_MAX];
+
+ /* Set LAM mode in parent process */
+ if (set_lam(lam) != 0)
+ return 1;
+
+ /* Get current binary's path and the binary was run by execve */
+ if (readlink("/proc/self/exe", path, PATH_MAX) <= 0)
+ exit(-1);
+
+ /* run binary to get LAM mode and return to parent process */
+ if (execlp(path, path, "-t 0x0", NULL) < 0) {
+ perror("error on exec");
+ exit(-1);
+ }
+ } else {
+ wait(&child_ret);
+ ret = WEXITSTATUS(child_ret);
+ if (ret != LAM_NONE)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int handle_inheritance(struct testcases *test)
+{
+ int ret, child_ret;
+ int lam = test->lam;
+ pid_t pid;
+
+ /* Set LAM mode in parent process */
+ if (set_lam(lam) != 0)
+ return 1;
+
+ pid = fork();
+ if (pid < 0) {
+ perror("Fork failed.");
+ return 1;
+ } else if (pid == 0) {
+ /* Set LAM mode in parent process */
+ int child_lam = get_lam();
+
+ exit(child_lam);
+ } else {
+ wait(&child_ret);
+ ret = WEXITSTATUS(child_ret);
+
+ if (lam != ret)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int thread_fn_get_lam(void *arg)
+{
+ return get_lam();
+}
+
+static int thread_fn_set_lam(void *arg)
+{
+ struct testcases *test = arg;
+
+ return set_lam(test->lam);
+}
+
+static int handle_thread(struct testcases *test)
+{
+ char stack[STACK_SIZE];
+ int ret, child_ret;
+ int lam = 0;
+ pid_t pid;
+
+ /* Set LAM mode in parent process */
+ if (!test->later) {
+ lam = test->lam;
+ if (set_lam(lam) != 0)
+ return 1;
+ }
+
+ pid = clone(thread_fn_get_lam, stack + STACK_SIZE,
+ SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, NULL);
+ if (pid < 0) {
+ perror("Clone failed.");
+ return 1;
+ }
+
+ waitpid(pid, &child_ret, 0);
+ ret = WEXITSTATUS(child_ret);
+
+ if (lam != ret)
+ return 1;
+
+ if (test->later) {
+ if (set_lam(test->lam) != 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int handle_thread_enable(struct testcases *test)
+{
+ char stack[STACK_SIZE];
+ int ret, child_ret;
+ int lam = test->lam;
+ pid_t pid;
+
+ pid = clone(thread_fn_set_lam, stack + STACK_SIZE,
+ SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM, test);
+ if (pid < 0) {
+ perror("Clone failed.");
+ return 1;
+ }
+
+ waitpid(pid, &child_ret, 0);
+ ret = WEXITSTATUS(child_ret);
+
+ if (lam != ret)
+ return 1;
+
+ return 0;
+}
+static void run_test(struct testcases *test, int count)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < count; i++) {
+ struct testcases *t = test + i;
+
+ /* fork a process to run test case */
+ tests_cnt++;
+ ret = fork_test(t);
+
+ /* return 3 is not support LA57, the case should be skipped */
+ if (ret == 3) {
+ ksft_test_result_skip(t->msg);
+ continue;
+ }
+
+ if (ret != 0)
+ ret = (t->expected == ret);
+ else
+ ret = !(t->expected);
+
+ ksft_test_result(ret, t->msg);
+ }
+}
+
+static struct testcases uring_cases[] = {
+ {
+ .later = 0,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_uring,
+ .msg = "URING: LAM_U57. Dereferencing pointer with metadata\n",
+ },
+ {
+ .later = 1,
+ .expected = 1,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_uring,
+ .msg = "URING:[Negative] Disable LAM. Dereferencing pointer with metadata.\n",
+ },
+};
+
+static struct testcases malloc_cases[] = {
+ {
+ .later = 0,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_malloc,
+ .msg = "MALLOC: LAM_U57. Dereferencing pointer with metadata\n",
+ },
+ {
+ .later = 1,
+ .expected = 2,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_malloc,
+ .msg = "MALLOC:[Negative] Disable LAM. Dereferencing pointer with metadata.\n",
+ },
+};
+
+static struct testcases bits_cases[] = {
+ {
+ .test_func = handle_max_bits,
+ .msg = "BITS: Check default tag bits\n",
+ },
+};
+
+static struct testcases syscall_cases[] = {
+ {
+ .later = 0,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_syscall,
+ .msg = "SYSCALL: LAM_U57. syscall with metadata\n",
+ },
+ {
+ .later = 1,
+ .expected = 1,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_syscall,
+ .msg = "SYSCALL:[Negative] Disable LAM. Dereferencing pointer with metadata.\n",
+ },
+};
+
+static struct testcases mmap_cases[] = {
+ {
+ .later = 1,
+ .expected = 0,
+ .lam = LAM_U57_BITS,
+ .addr = HIGH_ADDR,
+ .test_func = handle_mmap,
+ .msg = "MMAP: First mmap high address, then set LAM_U57.\n",
+ },
+ {
+ .later = 0,
+ .expected = 0,
+ .lam = LAM_U57_BITS,
+ .addr = HIGH_ADDR,
+ .test_func = handle_mmap,
+ .msg = "MMAP: First LAM_U57, then High address.\n",
+ },
+ {
+ .later = 0,
+ .expected = 0,
+ .lam = LAM_U57_BITS,
+ .addr = LOW_ADDR,
+ .test_func = handle_mmap,
+ .msg = "MMAP: First LAM_U57, then Low address.\n",
+ },
+};
+
+static struct testcases inheritance_cases[] = {
+ {
+ .expected = 0,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_inheritance,
+ .msg = "FORK: LAM_U57, child process should get LAM mode same as parent\n",
+ },
+ {
+ .expected = 0,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_thread,
+ .msg = "THREAD: LAM_U57, child thread should get LAM mode same as parent\n",
+ },
+ {
+ .expected = 1,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_thread_enable,
+ .msg = "THREAD: [NEGATIVE] Enable LAM in child.\n",
+ },
+ {
+ .expected = 1,
+ .later = 1,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_thread,
+ .msg = "THREAD: [NEGATIVE] Enable LAM in parent after thread created.\n",
+ },
+ {
+ .expected = 0,
+ .lam = LAM_U57_BITS,
+ .test_func = handle_execve,
+ .msg = "EXECVE: LAM_U57, child process should get disabled LAM mode\n",
+ },
+};
+
+static void cmd_help(void)
+{
+ printf("usage: lam [-h] [-t test list]\n");
+ printf("\t-t test list: run tests specified in the test list, default:0x%x\n", TEST_MASK);
+ printf("\t\t0x1:malloc; 0x2:max_bits; 0x4:mmap; 0x8:syscall; 0x10:io_uring; 0x20:inherit;\n");
+ printf("\t-h: help\n");
+}
+
+/* Check for file existence */
+uint8_t file_Exists(const char *fileName)
+{
+ struct stat buffer;
+
+ uint8_t ret = (stat(fileName, &buffer) == 0);
+
+ return ret;
+}
+
+/* Sysfs idxd files */
+const char *dsa_configs[] = {
+ "echo 1 > /sys/bus/dsa/devices/dsa0/wq0.1/group_id",
+ "echo shared > /sys/bus/dsa/devices/dsa0/wq0.1/mode",
+ "echo 10 > /sys/bus/dsa/devices/dsa0/wq0.1/priority",
+ "echo 16 > /sys/bus/dsa/devices/dsa0/wq0.1/size",
+ "echo 15 > /sys/bus/dsa/devices/dsa0/wq0.1/threshold",
+ "echo user > /sys/bus/dsa/devices/dsa0/wq0.1/type",
+ "echo MyApp1 > /sys/bus/dsa/devices/dsa0/wq0.1/name",
+ "echo 1 > /sys/bus/dsa/devices/dsa0/engine0.1/group_id",
+ "echo dsa0 > /sys/bus/dsa/drivers/idxd/bind",
+ /* bind files and devices, generated a device file in /dev */
+ "echo wq0.1 > /sys/bus/dsa/drivers/user/bind",
+};
+
+/* DSA device file */
+const char *dsaDeviceFile = "/dev/dsa/wq0.1";
+/* file for io*/
+const char *dsaPasidEnable = "/sys/bus/dsa/devices/dsa0/pasid_enabled";
+
+/*
+ * DSA depends on kernel cmdline "intel_iommu=on,sm_on"
+ * return pasid_enabled (0: disable 1:enable)
+ */
+int Check_DSA_Kernel_Setting(void)
+{
+ char command[256] = "";
+ char buf[256] = "";
+ char *ptr;
+ int rv = -1;
+
+ snprintf(command, sizeof(command) - 1, "cat %s", dsaPasidEnable);
+
+ FILE *cmd = popen(command, "r");
+
+ if (cmd) {
+ while (fgets(buf, sizeof(buf) - 1, cmd) != NULL);
+
+ pclose(cmd);
+ rv = strtol(buf, &ptr, 16);
+ }
+
+ return rv;
+}
+
+/*
+ * Config DSA's sysfs files as shared DSA's WQ.
+ * Generated a device file /dev/dsa/wq0.1
+ * Return: 0 OK; 1 Failed; 3 Skip(SVA disabled).
+ */
+int Dsa_Init_Sysfs(void)
+{
+ uint len = ARRAY_SIZE(dsa_configs);
+ const char **p = dsa_configs;
+
+ if (file_Exists(dsaDeviceFile) == 1)
+ return 0;
+
+ /* check the idxd driver */
+ if (file_Exists(dsaPasidEnable) != 1) {
+ printf("Please make sure idxd driver was loaded\n");
+ return 3;
+ }
+
+ /* Check SVA feature */
+ if (Check_DSA_Kernel_Setting() != 1) {
+ printf("Please enable SVA.(Add intel_iommu=on,sm_on in kernel cmdline)\n");
+ return 3;
+ }
+
+ /* Check the idxd device file on /dev/dsa/ */
+ for (int i = 0; i < len; i++) {
+ if (system(p[i]))
+ return 1;
+ }
+
+ /* After config, /dev/dsa/wq0.1 should be generated */
+ return (file_Exists(dsaDeviceFile) != 1);
+}
+
+/*
+ * Open DSA device file, triger API: iommu_sva_alloc_pasid
+ */
+void *allocate_dsa_pasid(void)
+{
+ int fd;
+ void *wq;
+
+ fd = open(dsaDeviceFile, O_RDWR);
+ if (fd < 0) {
+ perror("open");
+ return MAP_FAILED;
+ }
+
+ wq = mmap(NULL, 0x1000, PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (wq == MAP_FAILED)
+ perror("mmap");
+
+ return wq;
+}
+
+int set_force_svm(void)
+{
+ int ret = 0;
+
+ ret = syscall(SYS_arch_prctl, ARCH_FORCE_TAGGED_SVA);
+
+ return ret;
+}
+
+int handle_pasid(struct testcases *test)
+{
+ uint tmp = test->cmd;
+ uint runed = 0x0;
+ int ret = 0;
+ void *wq = NULL;
+
+ ret = Dsa_Init_Sysfs();
+ if (ret != 0)
+ return ret;
+
+ for (int i = 0; i < 3; i++) {
+ int err = 0;
+
+ if (tmp & 0x1) {
+ /* run set lam mode*/
+ if ((runed & 0x1) == 0) {
+ err = set_lam(LAM_U57_BITS);
+ runed = runed | 0x1;
+ } else
+ err = 1;
+ } else if (tmp & 0x4) {
+ /* run force svm */
+ if ((runed & 0x4) == 0) {
+ err = set_force_svm();
+ runed = runed | 0x4;
+ } else
+ err = 1;
+ } else if (tmp & 0x2) {
+ /* run allocate pasid */
+ if ((runed & 0x2) == 0) {
+ runed = runed | 0x2;
+ wq = allocate_dsa_pasid();
+ if (wq == MAP_FAILED)
+ err = 1;
+ } else
+ err = 1;
+ }
+
+ ret = ret + err;
+ if (ret > 0)
+ break;
+
+ tmp = tmp >> 4;
+ }
+
+ if (wq != MAP_FAILED && wq != NULL)
+ if (munmap(wq, 0x1000))
+ printf("munmap failed %d\n", errno);
+
+ if (runed != 0x7)
+ ret = 1;
+
+ return (ret != 0);
+}
+
+/*
+ * Pasid test depends on idxd and SVA, kernel should enable iommu and sm.
+ * command line(intel_iommu=on,sm_on)
+ */
+static struct testcases pasid_cases[] = {
+ {
+ .expected = 1,
+ .cmd = PAS_CMD(LAM_CMD_BIT, PAS_CMD_BIT, SVA_CMD_BIT),
+ .test_func = handle_pasid,
+ .msg = "PASID: [Negative] Execute LAM, PASID, SVA in sequence\n",
+ },
+ {
+ .expected = 0,
+ .cmd = PAS_CMD(LAM_CMD_BIT, SVA_CMD_BIT, PAS_CMD_BIT),
+ .test_func = handle_pasid,
+ .msg = "PASID: Execute LAM, SVA, PASID in sequence\n",
+ },
+ {
+ .expected = 1,
+ .cmd = PAS_CMD(PAS_CMD_BIT, LAM_CMD_BIT, SVA_CMD_BIT),
+ .test_func = handle_pasid,
+ .msg = "PASID: [Negative] Execute PASID, LAM, SVA in sequence\n",
+ },
+ {
+ .expected = 0,
+ .cmd = PAS_CMD(PAS_CMD_BIT, SVA_CMD_BIT, LAM_CMD_BIT),
+ .test_func = handle_pasid,
+ .msg = "PASID: Execute PASID, SVA, LAM in sequence\n",
+ },
+ {
+ .expected = 0,
+ .cmd = PAS_CMD(SVA_CMD_BIT, LAM_CMD_BIT, PAS_CMD_BIT),
+ .test_func = handle_pasid,
+ .msg = "PASID: Execute SVA, LAM, PASID in sequence\n",
+ },
+ {
+ .expected = 0,
+ .cmd = PAS_CMD(SVA_CMD_BIT, PAS_CMD_BIT, LAM_CMD_BIT),
+ .test_func = handle_pasid,
+ .msg = "PASID: Execute SVA, PASID, LAM in sequence\n",
+ },
+};
+
+int main(int argc, char **argv)
+{
+ int c = 0;
+ unsigned int tests = TEST_MASK;
+
+ tests_cnt = 0;
+
+ if (!cpu_has_lam()) {
+ ksft_print_msg("Unsupported LAM feature!\n");
+ return -1;
+ }
+
+ while ((c = getopt(argc, argv, "ht:")) != -1) {
+ switch (c) {
+ case 't':
+ tests = strtoul(optarg, NULL, 16);
+ if (tests && !(tests & TEST_MASK)) {
+ ksft_print_msg("Invalid argument!\n");
+ return -1;
+ }
+ break;
+ case 'h':
+ cmd_help();
+ return 0;
+ default:
+ ksft_print_msg("Invalid argument\n");
+ return -1;
+ }
+ }
+
+ /*
+ * When tests is 0, it is not a real test case;
+ * the option used by test case(execve) to check the lam mode in
+ * process generated by execve, the process read back lam mode and
+ * check with lam mode in parent process.
+ */
+ if (!tests)
+ return (get_lam());
+
+ /* Run test cases */
+ if (tests & FUNC_MALLOC)
+ run_test(malloc_cases, ARRAY_SIZE(malloc_cases));
+
+ if (tests & FUNC_BITS)
+ run_test(bits_cases, ARRAY_SIZE(bits_cases));
+
+ if (tests & FUNC_MMAP)
+ run_test(mmap_cases, ARRAY_SIZE(mmap_cases));
+
+ if (tests & FUNC_SYSCALL)
+ run_test(syscall_cases, ARRAY_SIZE(syscall_cases));
+
+ if (tests & FUNC_URING)
+ run_test(uring_cases, ARRAY_SIZE(uring_cases));
+
+ if (tests & FUNC_INHERITE)
+ run_test(inheritance_cases, ARRAY_SIZE(inheritance_cases));
+
+ if (tests & FUNC_PASID)
+ run_test(pasid_cases, ARRAY_SIZE(pasid_cases));
+
+ ksft_set_plan(tests_cnt);
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/x86/test_shadow_stack.c b/tools/testing/selftests/x86/test_shadow_stack.c
new file mode 100644
index 000000000000..94eb223456f6
--- /dev/null
+++ b/tools/testing/selftests/x86/test_shadow_stack.c
@@ -0,0 +1,695 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program test's basic kernel shadow stack support. It enables shadow
+ * stack manual via the arch_prctl(), instead of relying on glibc. It's
+ * Makefile doesn't compile with shadow stack support, so it doesn't rely on
+ * any particular glibc. As a result it can't do any operations that require
+ * special glibc shadow stack support (longjmp(), swapcontext(), etc). Just
+ * stick to the basics and hope the compiler doesn't do anything strange.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/syscall.h>
+#include <asm/mman.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <x86intrin.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+#include <stdint.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/ioctl.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+
+/*
+ * Define the ABI defines if needed, so people can run the tests
+ * without building the headers.
+ */
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 451
+
+#define SHADOW_STACK_SET_TOKEN (1ULL << 0)
+
+#define ARCH_SHSTK_ENABLE 0x5001
+#define ARCH_SHSTK_DISABLE 0x5002
+#define ARCH_SHSTK_LOCK 0x5003
+#define ARCH_SHSTK_UNLOCK 0x5004
+#define ARCH_SHSTK_STATUS 0x5005
+
+#define ARCH_SHSTK_SHSTK (1ULL << 0)
+#define ARCH_SHSTK_WRSS (1ULL << 1)
+#endif
+
+#define SS_SIZE 0x200000
+
+#if (__GNUC__ < 8) || (__GNUC__ == 8 && __GNUC_MINOR__ < 5)
+int main(int argc, char *argv[])
+{
+ printf("[SKIP]\tCompiler does not support CET.\n");
+ return 0;
+}
+#else
+void write_shstk(unsigned long *addr, unsigned long val)
+{
+ asm volatile("wrssq %[val], (%[addr])\n"
+ : "=m" (addr)
+ : [addr] "r" (addr), [val] "r" (val));
+}
+
+static inline unsigned long __attribute__((always_inline)) get_ssp(void)
+{
+ unsigned long ret = 0;
+
+ asm volatile("xor %0, %0; rdsspq %0" : "=r" (ret));
+ return ret;
+}
+
+/*
+ * For use in inline enablement of shadow stack.
+ *
+ * The program can't return from the point where shadow stack gets enabled
+ * because there will be no address on the shadow stack. So it can't use
+ * syscall() for enablement, since it is a function.
+ *
+ * Based on code from nolibc.h. Keep a copy here because this can't pull in all
+ * of nolibc.h.
+ */
+#define ARCH_PRCTL(arg1, arg2) \
+({ \
+ long _ret; \
+ register long _num asm("eax") = __NR_arch_prctl; \
+ register long _arg1 asm("rdi") = (long)(arg1); \
+ register long _arg2 asm("rsi") = (long)(arg2); \
+ \
+ asm volatile ( \
+ "syscall\n" \
+ : "=a"(_ret) \
+ : "r"(_arg1), "r"(_arg2), \
+ "0"(_num) \
+ : "rcx", "r11", "memory", "cc" \
+ ); \
+ _ret; \
+})
+
+void *create_shstk(void *addr)
+{
+ return (void *)syscall(__NR_map_shadow_stack, addr, SS_SIZE, SHADOW_STACK_SET_TOKEN);
+}
+
+void *create_normal_mem(void *addr)
+{
+ return mmap(addr, SS_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+}
+
+void free_shstk(void *shstk)
+{
+ munmap(shstk, SS_SIZE);
+}
+
+int reset_shstk(void *shstk)
+{
+ return madvise(shstk, SS_SIZE, MADV_DONTNEED);
+}
+
+void try_shstk(unsigned long new_ssp)
+{
+ unsigned long ssp;
+
+ printf("[INFO]\tnew_ssp = %lx, *new_ssp = %lx\n",
+ new_ssp, *((unsigned long *)new_ssp));
+
+ ssp = get_ssp();
+ printf("[INFO]\tchanging ssp from %lx to %lx\n", ssp, new_ssp);
+
+ asm volatile("rstorssp (%0)\n":: "r" (new_ssp));
+ asm volatile("saveprevssp");
+ printf("[INFO]\tssp is now %lx\n", get_ssp());
+
+ /* Switch back to original shadow stack */
+ ssp -= 8;
+ asm volatile("rstorssp (%0)\n":: "r" (ssp));
+ asm volatile("saveprevssp");
+}
+
+int test_shstk_pivot(void)
+{
+ void *shstk = create_shstk(0);
+
+ if (shstk == MAP_FAILED) {
+ printf("[FAIL]\tError creating shadow stack: %d\n", errno);
+ return 1;
+ }
+ try_shstk((unsigned long)shstk + SS_SIZE - 8);
+ free_shstk(shstk);
+
+ printf("[OK]\tShadow stack pivot\n");
+ return 0;
+}
+
+int test_shstk_faults(void)
+{
+ unsigned long *shstk = create_shstk(0);
+
+ /* Read shadow stack, test if it's zero to not get read optimized out */
+ if (*shstk != 0)
+ goto err;
+
+ /* Wrss memory that was already read. */
+ write_shstk(shstk, 1);
+ if (*shstk != 1)
+ goto err;
+
+ /* Page out memory, so we can wrss it again. */
+ if (reset_shstk((void *)shstk))
+ goto err;
+
+ write_shstk(shstk, 1);
+ if (*shstk != 1)
+ goto err;
+
+ printf("[OK]\tShadow stack faults\n");
+ return 0;
+
+err:
+ return 1;
+}
+
+unsigned long saved_ssp;
+unsigned long saved_ssp_val;
+volatile bool segv_triggered;
+
+void __attribute__((noinline)) violate_ss(void)
+{
+ saved_ssp = get_ssp();
+ saved_ssp_val = *(unsigned long *)saved_ssp;
+
+ /* Corrupt shadow stack */
+ printf("[INFO]\tCorrupting shadow stack\n");
+ write_shstk((void *)saved_ssp, 0);
+}
+
+void segv_handler(int signum, siginfo_t *si, void *uc)
+{
+ printf("[INFO]\tGenerated shadow stack violation successfully\n");
+
+ segv_triggered = true;
+
+ /* Fix shadow stack */
+ write_shstk((void *)saved_ssp, saved_ssp_val);
+}
+
+int test_shstk_violation(void)
+{
+ struct sigaction sa;
+
+ sa.sa_sigaction = segv_handler;
+ if (sigaction(SIGSEGV, &sa, NULL))
+ return 1;
+ sa.sa_flags = SA_SIGINFO;
+
+ segv_triggered = false;
+
+ /* Make sure segv_triggered is set before violate_ss() */
+ asm volatile("" : : : "memory");
+
+ violate_ss();
+
+ signal(SIGSEGV, SIG_DFL);
+
+ printf("[OK]\tShadow stack violation test\n");
+
+ return !segv_triggered;
+}
+
+/* Gup test state */
+#define MAGIC_VAL 0x12345678
+bool is_shstk_access;
+void *shstk_ptr;
+int fd;
+
+void reset_test_shstk(void *addr)
+{
+ if (shstk_ptr)
+ free_shstk(shstk_ptr);
+ shstk_ptr = create_shstk(addr);
+}
+
+void test_access_fix_handler(int signum, siginfo_t *si, void *uc)
+{
+ printf("[INFO]\tViolation from %s\n", is_shstk_access ? "shstk access" : "normal write");
+
+ segv_triggered = true;
+
+ /* Fix shadow stack */
+ if (is_shstk_access) {
+ reset_test_shstk(shstk_ptr);
+ return;
+ }
+
+ free_shstk(shstk_ptr);
+ create_normal_mem(shstk_ptr);
+}
+
+bool test_shstk_access(void *ptr)
+{
+ is_shstk_access = true;
+ segv_triggered = false;
+ write_shstk(ptr, MAGIC_VAL);
+
+ asm volatile("" : : : "memory");
+
+ return segv_triggered;
+}
+
+bool test_write_access(void *ptr)
+{
+ is_shstk_access = false;
+ segv_triggered = false;
+ *(unsigned long *)ptr = MAGIC_VAL;
+
+ asm volatile("" : : : "memory");
+
+ return segv_triggered;
+}
+
+bool gup_write(void *ptr)
+{
+ unsigned long val;
+
+ lseek(fd, (unsigned long)ptr, SEEK_SET);
+ if (write(fd, &val, sizeof(val)) < 0)
+ return 1;
+
+ return 0;
+}
+
+bool gup_read(void *ptr)
+{
+ unsigned long val;
+
+ lseek(fd, (unsigned long)ptr, SEEK_SET);
+ if (read(fd, &val, sizeof(val)) < 0)
+ return 1;
+
+ return 0;
+}
+
+int test_gup(void)
+{
+ struct sigaction sa;
+ int status;
+ pid_t pid;
+
+ sa.sa_sigaction = test_access_fix_handler;
+ if (sigaction(SIGSEGV, &sa, NULL))
+ return 1;
+ sa.sa_flags = SA_SIGINFO;
+
+ segv_triggered = false;
+
+ fd = open("/proc/self/mem", O_RDWR);
+ if (fd == -1)
+ return 1;
+
+ reset_test_shstk(0);
+ if (gup_read(shstk_ptr))
+ return 1;
+ if (test_shstk_access(shstk_ptr))
+ return 1;
+ printf("[INFO]\tGup read -> shstk access success\n");
+
+ reset_test_shstk(0);
+ if (gup_write(shstk_ptr))
+ return 1;
+ if (test_shstk_access(shstk_ptr))
+ return 1;
+ printf("[INFO]\tGup write -> shstk access success\n");
+
+ reset_test_shstk(0);
+ if (gup_read(shstk_ptr))
+ return 1;
+ if (!test_write_access(shstk_ptr))
+ return 1;
+ printf("[INFO]\tGup read -> write access success\n");
+
+ reset_test_shstk(0);
+ if (gup_write(shstk_ptr))
+ return 1;
+ if (!test_write_access(shstk_ptr))
+ return 1;
+ printf("[INFO]\tGup write -> write access success\n");
+
+ close(fd);
+
+ /* COW/gup test */
+ reset_test_shstk(0);
+ pid = fork();
+ if (!pid) {
+ fd = open("/proc/self/mem", O_RDWR);
+ if (fd == -1)
+ exit(1);
+
+ if (gup_write(shstk_ptr)) {
+ close(fd);
+ exit(1);
+ }
+ close(fd);
+ exit(0);
+ }
+ waitpid(pid, &status, 0);
+ if (WEXITSTATUS(status)) {
+ printf("[FAIL]\tWrite in child failed\n");
+ return 1;
+ }
+ if (*(unsigned long *)shstk_ptr == MAGIC_VAL) {
+ printf("[FAIL]\tWrite in child wrote through to shared memory\n");
+ return 1;
+ }
+
+ printf("[INFO]\tCow gup write -> write access success\n");
+
+ free_shstk(shstk_ptr);
+
+ signal(SIGSEGV, SIG_DFL);
+
+ printf("[OK]\tShadow gup test\n");
+
+ return 0;
+}
+
+int test_mprotect(void)
+{
+ struct sigaction sa;
+
+ sa.sa_sigaction = test_access_fix_handler;
+ if (sigaction(SIGSEGV, &sa, NULL))
+ return 1;
+ sa.sa_flags = SA_SIGINFO;
+
+ segv_triggered = false;
+
+ /* mprotect a shadow stack as read only */
+ reset_test_shstk(0);
+ if (mprotect(shstk_ptr, SS_SIZE, PROT_READ) < 0) {
+ printf("[FAIL]\tmprotect(PROT_READ) failed\n");
+ return 1;
+ }
+
+ /* try to wrss it and fail */
+ if (!test_shstk_access(shstk_ptr)) {
+ printf("[FAIL]\tShadow stack access to read-only memory succeeded\n");
+ return 1;
+ }
+
+ /*
+ * The shadow stack was reset above to resolve the fault, make the new one
+ * read-only.
+ */
+ if (mprotect(shstk_ptr, SS_SIZE, PROT_READ) < 0) {
+ printf("[FAIL]\tmprotect(PROT_READ) failed\n");
+ return 1;
+ }
+
+ /* then back to writable */
+ if (mprotect(shstk_ptr, SS_SIZE, PROT_WRITE | PROT_READ) < 0) {
+ printf("[FAIL]\tmprotect(PROT_WRITE) failed\n");
+ return 1;
+ }
+
+ /* then wrss to it and succeed */
+ if (test_shstk_access(shstk_ptr)) {
+ printf("[FAIL]\tShadow stack access to mprotect() writable memory failed\n");
+ return 1;
+ }
+
+ free_shstk(shstk_ptr);
+
+ signal(SIGSEGV, SIG_DFL);
+
+ printf("[OK]\tmprotect() test\n");
+
+ return 0;
+}
+
+char zero[4096];
+
+static void *uffd_thread(void *arg)
+{
+ struct uffdio_copy req;
+ int uffd = *(int *)arg;
+ struct uffd_msg msg;
+
+ if (read(uffd, &msg, sizeof(msg)) <= 0)
+ return (void *)1;
+
+ req.dst = msg.arg.pagefault.address;
+ req.src = (__u64)zero;
+ req.len = 4096;
+ req.mode = 0;
+
+ if (ioctl(uffd, UFFDIO_COPY, &req))
+ return (void *)1;
+
+ return (void *)0;
+}
+
+int test_userfaultfd(void)
+{
+ struct uffdio_register uffdio_register;
+ struct uffdio_api uffdio_api;
+ struct sigaction sa;
+ pthread_t thread;
+ void *res;
+ int uffd;
+
+ sa.sa_sigaction = test_access_fix_handler;
+ if (sigaction(SIGSEGV, &sa, NULL))
+ return 1;
+ sa.sa_flags = SA_SIGINFO;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ printf("[SKIP]\tUserfaultfd unavailable.\n");
+ return 0;
+ }
+
+ reset_test_shstk(0);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api))
+ goto err;
+
+ uffdio_register.range.start = (__u64)shstk_ptr;
+ uffdio_register.range.len = 4096;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
+ goto err;
+
+ if (pthread_create(&thread, NULL, &uffd_thread, &uffd))
+ goto err;
+
+ reset_shstk(shstk_ptr);
+ test_shstk_access(shstk_ptr);
+
+ if (pthread_join(thread, &res))
+ goto err;
+
+ if (test_shstk_access(shstk_ptr))
+ goto err;
+
+ free_shstk(shstk_ptr);
+
+ signal(SIGSEGV, SIG_DFL);
+
+ if (!res)
+ printf("[OK]\tUserfaultfd test\n");
+ return !!res;
+err:
+ free_shstk(shstk_ptr);
+ close(uffd);
+ signal(SIGSEGV, SIG_DFL);
+ return 1;
+}
+
+/*
+ * Too complicated to pull it out of the 32 bit header, but also get the
+ * 64 bit one needed above. Just define a copy here.
+ */
+#define __NR_compat_sigaction 67
+
+/*
+ * Call 32 bit signal handler to get 32 bit signals ABI. Make sure
+ * to push the registers that will get clobbered.
+ */
+int sigaction32(int signum, const struct sigaction *restrict act,
+ struct sigaction *restrict oldact)
+{
+ register long syscall_reg asm("eax") = __NR_compat_sigaction;
+ register long signum_reg asm("ebx") = signum;
+ register long act_reg asm("ecx") = (long)act;
+ register long oldact_reg asm("edx") = (long)oldact;
+ int ret = 0;
+
+ asm volatile ("int $0x80;"
+ : "=a"(ret), "=m"(oldact)
+ : "r"(syscall_reg), "r"(signum_reg), "r"(act_reg),
+ "r"(oldact_reg)
+ : "r8", "r9", "r10", "r11"
+ );
+
+ return ret;
+}
+
+sigjmp_buf jmp_buffer;
+
+void segv_gp_handler(int signum, siginfo_t *si, void *uc)
+{
+ segv_triggered = true;
+
+ /*
+ * To work with old glibc, this can't rely on siglongjmp working with
+ * shadow stack enabled, so disable shadow stack before siglongjmp().
+ */
+ ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+ siglongjmp(jmp_buffer, -1);
+}
+
+/*
+ * Transition to 32 bit mode and check that a #GP triggers a segfault.
+ */
+int test_32bit(void)
+{
+ struct sigaction sa;
+ struct sigaction *sa32;
+
+ /* Create sigaction in 32 bit address range */
+ sa32 = mmap(0, 4096, PROT_READ | PROT_WRITE,
+ MAP_32BIT | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+ sa32->sa_flags = SA_SIGINFO;
+
+ sa.sa_sigaction = segv_gp_handler;
+ if (sigaction(SIGSEGV, &sa, NULL))
+ return 1;
+ sa.sa_flags = SA_SIGINFO;
+
+ segv_triggered = false;
+
+ /* Make sure segv_triggered is set before triggering the #GP */
+ asm volatile("" : : : "memory");
+
+ /*
+ * Set handler to somewhere in 32 bit address space
+ */
+ sa32->sa_handler = (void *)sa32;
+ if (sigaction32(SIGUSR1, sa32, NULL))
+ return 1;
+
+ if (!sigsetjmp(jmp_buffer, 1))
+ raise(SIGUSR1);
+
+ if (segv_triggered)
+ printf("[OK]\t32 bit test\n");
+
+ return !segv_triggered;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret = 0;
+
+ if (ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK)) {
+ printf("[SKIP]\tCould not enable Shadow stack\n");
+ return 1;
+ }
+
+ if (ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) {
+ ret = 1;
+ printf("[FAIL]\tDisabling shadow stack failed\n");
+ }
+
+ if (ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK)) {
+ printf("[SKIP]\tCould not re-enable Shadow stack\n");
+ return 1;
+ }
+
+ if (ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_WRSS)) {
+ printf("[SKIP]\tCould not enable WRSS\n");
+ ret = 1;
+ goto out;
+ }
+
+ /* Should have succeeded if here, but this is a test, so double check. */
+ if (!get_ssp()) {
+ printf("[FAIL]\tShadow stack disabled\n");
+ return 1;
+ }
+
+ if (test_shstk_pivot()) {
+ ret = 1;
+ printf("[FAIL]\tShadow stack pivot\n");
+ goto out;
+ }
+
+ if (test_shstk_faults()) {
+ ret = 1;
+ printf("[FAIL]\tShadow stack fault test\n");
+ goto out;
+ }
+
+ if (test_shstk_violation()) {
+ ret = 1;
+ printf("[FAIL]\tShadow stack violation test\n");
+ goto out;
+ }
+
+ if (test_gup()) {
+ ret = 1;
+ printf("[FAIL]\tShadow shadow stack gup\n");
+ goto out;
+ }
+
+ if (test_mprotect()) {
+ ret = 1;
+ printf("[FAIL]\tShadow shadow mprotect test\n");
+ goto out;
+ }
+
+ if (test_userfaultfd()) {
+ ret = 1;
+ printf("[FAIL]\tUserfaultfd test\n");
+ goto out;
+ }
+
+ if (test_32bit()) {
+ ret = 1;
+ printf("[FAIL]\t32 bit test\n");
+ }
+
+ return ret;
+
+out:
+ /*
+ * Disable shadow stack before the function returns, or there will be a
+ * shadow stack violation.
+ */
+ if (ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK)) {
+ ret = 1;
+ printf("[FAIL]\tDisabling shadow stack failed\n");
+ }
+
+ return ret;
+}
+#endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1679d08a216..537f33ac49de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -62,11 +62,14 @@
#include "kvm_mm.h"
#include "vfio.h"
+#include <trace/events/ipi.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>
#include <linux/kvm_dirty_ring.h>
+
/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12