author     Stephen Rothwell <sfr@canb.auug.org.au>  2020-06-24 12:39:50 +1000
committer  Stephen Rothwell <sfr@canb.auug.org.au>  2020-06-24 12:39:51 +1000
commit     e3e5f7f48c27ddba468ad67f06b742e78d7e5ab6 (patch)
tree       d1ad683fb6a7780b879d7baaa4579e4d85b111be
parent     5b3cfd00355b17e4465669a5c50e984f1c37f2b8 (diff)
parent     9f2a2b6c23758d798f548911f6119def723d1029 (diff)
download   linux-next-e3e5f7f48c27ddba468ad67f06b742e78d7e5ab6.tar.gz
Merge remote-tracking branch 'tip/auto-latest'
-rw-r--r--  Documentation/ABI/testing/sysfs-devices-mapping | 33
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 2
-rw-r--r--  Documentation/dev-tools/kcsan.rst | 6
-rw-r--r--  Documentation/x86/x86_64/fsgs.rst | 199
-rw-r--r--  Documentation/x86/x86_64/index.rst | 1
-rw-r--r--  arch/arm/common/bL_switcher.c | 3
-rw-r--r--  arch/arm/include/asm/efi.h | 7
-rw-r--r--  arch/x86/Kconfig | 5
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 11
-rw-r--r--  arch/x86/entry/calling.h | 40
-rw-r--r--  arch/x86/entry/entry_64.S | 139
-rw-r--r--  arch/x86/events/Makefile | 2
-rw-r--r--  arch/x86/events/amd/power.c | 4
-rw-r--r--  arch/x86/events/intel/uncore.c | 26
-rw-r--r--  arch/x86/events/intel/uncore.h | 37
-rw-r--r--  arch/x86/events/intel/uncore_snb.c | 80
-rw-r--r--  arch/x86/events/intel/uncore_snbep.c | 208
-rw-r--r--  arch/x86/include/asm/asm.h | 6
-rw-r--r--  arch/x86/include/asm/bitops.h | 6
-rw-r--r--  arch/x86/include/asm/bug.h | 6
-rw-r--r--  arch/x86/include/asm/cpu.h | 5
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 2
-rw-r--r--  arch/x86/include/asm/cpumask.h | 18
-rw-r--r--  arch/x86/include/asm/div64.h | 14
-rw-r--r--  arch/x86/include/asm/fsgsbase.h | 45
-rw-r--r--  arch/x86/include/asm/inst.h | 15
-rw-r--r--  arch/x86/include/asm/kprobes.h | 2
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 5
-rw-r--r--  arch/x86/include/asm/msr-index.h | 4
-rw-r--r--  arch/x86/include/asm/pgtable.h | 9
-rw-r--r--  arch/x86/include/asm/processor.h | 6
-rw-r--r--  arch/x86/include/asm/stackprotector.h | 12
-rw-r--r--  arch/x86/include/asm/topology.h | 2
-rw-r--r--  arch/x86/include/uapi/asm/hwcap2.h | 3
-rw-r--r--  arch/x86/kernel/alternative.c | 41
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 19
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 46
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 4
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/dev-mcelog.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mce/inject.c | 2
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 2
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 8
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 1
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 1
-rw-r--r--  arch/x86/kernel/cpu/zhaoxin.c | 1
-rw-r--r--  arch/x86/kernel/idt.c | 2
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 15
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 38
-rw-r--r--  arch/x86/kernel/msr.c | 69
-rw-r--r--  arch/x86/kernel/nmi.c | 2
-rw-r--r--  arch/x86/kernel/process.c | 10
-rw-r--r--  arch/x86/kernel/process_64.c | 123
-rw-r--r--  arch/x86/kernel/ptrace.c | 17
-rw-r--r--  arch/x86/kernel/smpboot.c | 64
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 2
-rw-r--r--  arch/x86/lib/usercopy_64.c | 1
-rw-r--r--  arch/x86/mm/init.c | 3
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 2
-rw-r--r--  arch/x86/power/cpu.c | 6
-rw-r--r--  arch/x86/xen/smp_pv.c | 2
-rw-r--r--  crypto/crypto_engine.c | 3
-rw-r--r--  drivers/acpi/acpi_pad.c | 3
-rw-r--r--  drivers/block/drbd/drbd_receiver.c | 5
-rw-r--r--  drivers/firmware/efi/Kconfig | 11
-rw-r--r--  drivers/firmware/efi/arm-init.c | 40
-rw-r--r--  drivers/firmware/efi/efi.c | 5
-rw-r--r--  drivers/firmware/efi/esrt.c | 2
-rw-r--r--  drivers/firmware/efi/libstub/Makefile | 3
-rw-r--r--  drivers/firmware/efi/libstub/arm32-stub.c | 54
-rw-r--r--  drivers/firmware/efi/libstub/efi-stub-helper.c | 78
-rw-r--r--  drivers/firmware/efi/libstub/efi-stub.c | 3
-rw-r--r--  drivers/firmware/efi/libstub/efistub.h | 12
-rw-r--r--  drivers/firmware/efi/libstub/file.c | 16
-rw-r--r--  drivers/firmware/efi/libstub/skip_spaces.c | 1
-rw-r--r--  drivers/firmware/psci/psci_checker.c | 10
-rw-r--r--  drivers/gpu/drm/msm/msm_drv.c | 13
-rw-r--r--  drivers/gpu/drm/scheduler/sched_main.c | 3
-rw-r--r--  drivers/hwmon/fam15h_power.c | 4
-rw-r--r--  drivers/media/pci/ivtv/ivtv-driver.c | 4
-rw-r--r--  drivers/mmc/core/sdio_irq.c | 3
-rw-r--r--  drivers/platform/chrome/cros_ec_spi.c | 11
-rw-r--r--  drivers/powercap/idle_inject.c | 4
-rw-r--r--  drivers/spi/spi.c | 4
-rw-r--r--  drivers/staging/android/ion/ion_heap.c | 4
-rw-r--r--  drivers/thermal/intel/intel_powerclamp.c | 5
-rw-r--r--  drivers/tty/serial/sc16is7xx.c | 3
-rw-r--r--  drivers/watchdog/watchdog_dev.c | 3
-rw-r--r--  fs/efivarfs/file.c | 7
-rw-r--r--  include/linux/compiler-clang.h | 8
-rw-r--r--  include/linux/compiler-gcc.h | 6
-rw-r--r--  include/linux/compiler_attributes.h | 1
-rw-r--r--  include/linux/compiler_types.h | 14
-rw-r--r--  include/linux/cpu.h | 1
-rw-r--r--  include/linux/efi.h | 8
-rw-r--r--  include/linux/ftrace.h | 12
-rw-r--r--  include/linux/irq_work.h | 26
-rw-r--r--  include/linux/kprobes.h | 15
-rw-r--r--  include/linux/lockdep.h | 178
-rw-r--r--  include/linux/lockdep_types.h | 196
-rw-r--r--  include/linux/math64.h | 2
-rw-r--r--  include/linux/perf_event.h | 8
-rw-r--r--  include/linux/psi_types.h | 7
-rw-r--r--  include/linux/sched.h | 8
-rw-r--r--  include/linux/sched/isolation.h | 1
-rw-r--r--  include/linux/sched/sysctl.h | 3
-rw-r--r--  include/linux/smp.h | 23
-rw-r--r--  include/linux/smp_types.h | 66
-rw-r--r--  include/linux/spinlock.h | 1
-rw-r--r--  include/linux/spinlock_types.h | 2
-rw-r--r--  include/linux/syscalls.h | 2
-rw-r--r--  include/linux/timekeeping.h | 14
-rw-r--r--  include/linux/tpm_eventlog.h | 14
-rw-r--r--  include/trace/events/sched.h | 8
-rw-r--r--  include/uapi/linux/perf_event.h | 26
-rw-r--r--  init/do_mounts_initrd.c | 5
-rw-r--r--  kernel/events/core.c | 90
-rw-r--r--  kernel/irq/manage.c | 6
-rw-r--r--  kernel/kallsyms.c | 42
-rw-r--r--  kernel/kprobes.c | 57
-rw-r--r--  kernel/kthread.c | 6
-rw-r--r--  kernel/locking/locktorture.c | 10
-rw-r--r--  kernel/rcu/rcuperf.c | 10
-rw-r--r--  kernel/rcu/rcutorture.c | 7
-rw-r--r--  kernel/sched/core.c | 95
-rw-r--r--  kernel/sched/cpudeadline.c | 24
-rw-r--r--  kernel/sched/cputime.c | 46
-rw-r--r--  kernel/sched/deadline.c | 115
-rw-r--r--  kernel/sched/fair.c | 67
-rw-r--r--  kernel/sched/idle.c | 7
-rw-r--r--  kernel/sched/isolation.c | 3
-rw-r--r--  kernel/sched/pelt.c | 4
-rw-r--r--  kernel/sched/pelt.h | 5
-rw-r--r--  kernel/sched/psi.c | 110
-rw-r--r--  kernel/sched/sched.h | 25
-rw-r--r--  kernel/sched/stop_task.c | 8
-rw-r--r--  kernel/sched/topology.c | 2
-rw-r--r--  kernel/smp.c | 18
-rw-r--r--  kernel/sysctl.c | 14
-rw-r--r--  kernel/trace/ftrace.c | 101
-rw-r--r--  lib/Kconfig.kasan | 4
-rw-r--r--  lib/math/div64.c | 41
-rw-r--r--  tools/objtool/arch.h | 4
-rw-r--r--  tools/objtool/arch/x86/decode.c | 20
-rw-r--r--  tools/objtool/arch/x86/include/arch_elf.h | 6
-rw-r--r--  tools/objtool/check.c | 247
-rw-r--r--  tools/objtool/check.h | 2
-rw-r--r--  tools/objtool/elf.c | 333
-rw-r--r--  tools/objtool/elf.h | 37
-rw-r--r--  tools/objtool/orc_gen.c | 46
-rw-r--r--  tools/objtool/special.c | 28
-rw-r--r--  tools/testing/selftests/x86/fsgsbase.c | 24
-rw-r--r--  tools/testing/selftests/x86/syscall_arg_fault.c | 26
154 files changed, 3095 insertions, 1013 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping
new file mode 100644
index 000000000000..490ccfd67f12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -0,0 +1,33 @@
+What: /sys/devices/uncore_iio_x/dieX
+Date: February 2020
+Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com>
+Description:
+ Each IIO stack (PCIe root port) has its own IIO PMON block, so
+ each dieX file (where X is die number) holds "Segment:Root Bus"
+ for PCIe root port, which can be monitored by that IIO PMON
+ block.
+ For example, on a 4-die Xeon platform with up to 6 IIO stacks per
+ die and, therefore, 6 IIO PMON blocks per die, the mapping of
+ IIO PMON block 0 is exposed as follows:
+
+ $ ls /sys/devices/uncore_iio_0/die*
+ -r--r--r-- /sys/devices/uncore_iio_0/die0
+ -r--r--r-- /sys/devices/uncore_iio_0/die1
+ -r--r--r-- /sys/devices/uncore_iio_0/die2
+ -r--r--r-- /sys/devices/uncore_iio_0/die3
+
+ $ tail /sys/devices/uncore_iio_0/die*
+ ==> /sys/devices/uncore_iio_0/die0 <==
+ 0000:00
+ ==> /sys/devices/uncore_iio_0/die1 <==
+ 0000:40
+ ==> /sys/devices/uncore_iio_0/die2 <==
+ 0000:80
+ ==> /sys/devices/uncore_iio_0/die3 <==
+ 0000:c0
+
+ Which means:
+ IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
+ IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
+ IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
+ IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
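For illustration only (not part of the patch): a minimal user-space sketch that reads the dieX files of IIO PMON block 0 and prints the "Segment:Root Bus" mapping described above. The sysfs path matches the ABI entry; the number of dies probed is an assumption for the example.

	#include <stdio.h>

	int main(void)
	{
		char path[64], buf[16];

		/* Probe die0..die7; stop at the first die file that does not exist. */
		for (int die = 0; die < 8; die++) {
			FILE *f;

			snprintf(path, sizeof(path), "/sys/devices/uncore_iio_0/die%d", die);
			f = fopen(path, "r");
			if (!f)
				break;
			if (fgets(buf, sizeof(buf), f))
				/* buf holds "Segment:Root Bus", e.g. "0000:c0" */
				printf("IIO PMU 0 on die %d -> PCI root port %s", die, buf);
			fclose(f);
		}
		return 0;
	}
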
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b5beb1fe78cf..bd6cd95e9a90 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3081,6 +3081,8 @@
no5lvl [X86-64] Disable 5-level paging mode. Forces
kernel to use 4-level paging instead.
+ nofsgsbase [X86] Disables FSGSBASE instructions.
+
no_console_suspend
[HW] Never suspend the console
Disable suspending of consoles during suspend and
diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index ce4bbd918648..b38379f06194 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -114,12 +114,6 @@ the below options are available:
To dynamically limit for which functions to generate reports, see the
`DebugFS interface`_ blacklist/whitelist feature.
- For ``__always_inline`` functions, replace ``__always_inline`` with
- ``__no_kcsan_or_inline`` (which implies ``__always_inline``)::
-
- static __no_kcsan_or_inline void foo(void) {
- ...
-
* To disable data race detection for a particular compilation unit, add to the
``Makefile``::
diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst
new file mode 100644
index 000000000000..50960e09e1f6
--- /dev/null
+++ b/Documentation/x86/x86_64/fsgs.rst
@@ -0,0 +1,199 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Using FS and GS segments in user space applications
+===================================================
+
+The x86 architecture supports segmentation. Instructions which access
+memory can use segment register based addressing mode. The following
+notation is used to address a byte within a segment:
+
+ Segment-register:Byte-address
+
+The segment base address is added to the Byte-address to compute the
+resulting virtual address which is accessed. This allows access to multiple
+instances of data at the identical Byte-address, i.e. from the same code. The
+selection of a particular instance is purely based on the base-address in
+the segment register.
+
+In 32-bit mode the CPU provides 6 segments, which also support segment
+limits. The limits can be used to enforce address space protections.
+
+In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is
+always 0 to provide a full 64-bit address space. The FS and GS segments are
+still functional in 64-bit mode.
+
+Common FS and GS usage
+------------------------------
+
+The FS segment is commonly used to address Thread Local Storage (TLS). FS
+is usually managed by runtime code or a threading library. Variables
+declared with the '__thread' storage class specifier are instantiated per
+thread and the compiler emits the FS: address prefix for accesses to these
+variables. Each thread has its own FS base address so common code can be
+used without complex address offset calculations to access the per thread
+instances. Applications should not use FS for other purposes when they use
+runtimes or threading libraries which manage the per thread FS.
+
+The GS segment has no common use and can be used freely by
+applications. GCC and Clang support GS based addressing via address space
+identifiers.
+
+Reading and writing the FS/GS base address
+------------------------------------------
+
+There exist two mechanisms to read and write the FS/GS base address:
+
+ - the arch_prctl() system call
+
+ - the FSGSBASE instruction family
+
+Accessing FS/GS base with arch_prctl()
+--------------------------------------
+
+ The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all
+ kernel versions.
+
+ Reading the base:
+
+ arch_prctl(ARCH_GET_FS, &fsbase);
+ arch_prctl(ARCH_GET_GS, &gsbase);
+
+ Writing the base:
+
+ arch_prctl(ARCH_SET_FS, fsbase);
+ arch_prctl(ARCH_SET_GS, gsbase);
+
+ The ARCH_SET_GS prctl may be disabled depending on kernel configuration
+ and security settings.
+
+Accessing FS/GS base with the FSGSBASE instructions
+---------------------------------------------------
+
+ With the Ivy Bridge CPU generation Intel introduced a new set of
+ instructions to access the FS and GS base registers directly from user
+ space. These instructions are also supported on AMD Family 17H CPUs. The
+ following instructions are available:
+
+ =============== ===========================
+ RDFSBASE %reg Read the FS base register
+ RDGSBASE %reg Read the GS base register
+ WRFSBASE %reg Write the FS base register
+ WRGSBASE %reg Write the GS base register
+ =============== ===========================
+
+ The instructions avoid the overhead of the arch_prctl() syscall and allow
+ more flexible usage of the FS/GS addressing modes in user space
+ applications. This does not prevent conflicts between threading libraries
+ and runtimes which utilize FS and applications which want to use it for
+ their own purpose.
+
+FSGSBASE instructions enablement
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If
+ available, /proc/cpuinfo shows 'fsgsbase' in the flags entry of the CPUs.
+
+ The availability of the instructions does not enable them
+ automatically. The kernel has to enable them explicitly in CR4. The
+ reason for this is that older kernels make assumptions about the values in
+ the GS register and enforce them when GS base is set via
+ arch_prctl(). Allowing user space to write arbitrary values to GS base
+ would violate these assumptions and cause malfunction.
+
+ On kernels which do not enable FSGSBASE the execution of the FSGSBASE
+ instructions will fault with a #UD exception.
+
+ The kernel provides reliable information about the enabled state in the
+ ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the
+ kernel has FSGSBASE instructions enabled and applications can use them.
+ The following code example shows how this detection works::
+
+ #include <sys/auxv.h>
+ #include <elf.h>
+
+ /* Will be eventually in asm/hwcap.h */
+ #ifndef HWCAP2_FSGSBASE
+ #define HWCAP2_FSGSBASE (1 << 1)
+ #endif
+
+ ....
+
+ unsigned val = getauxval(AT_HWCAP2);
+
+ if (val & HWCAP2_FSGSBASE)
+ printf("FSGSBASE enabled\n");
+
+FSGSBASE instructions compiler support
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+GCC version 4.6.4 and newer provide intrinsics for the FSGSBASE
+instructions. Clang 5 supports them as well.
+
+ =================== ===========================
+ _readfsbase_u64() Read the FS base register
+ _readgsbase_u64() Read the GS base register
+ _writefsbase_u64() Write the FS base register
+ _writegsbase_u64() Write the GS base register
+ =================== ===========================
+
+To utilize these intrinsics <immintrin.h> must be included in the source
+code and the compiler option -mfsgsbase has to be added.
+
+Compiler support for FS/GS based addressing
+-------------------------------------------
+
+GCC version 6 and newer provide support for FS/GS based addressing via
+Named Address Spaces. GCC implements the following address space
+identifiers for x86:
+
+ ========= ====================================
+ __seg_fs Variable is addressed relative to FS
+ __seg_gs Variable is addressed relative to GS
+ ========= ====================================
+
+The preprocessor symbols __SEG_FS and __SEG_GS are defined when these
+address spaces are supported. Code which implements fallback modes should
+check whether these symbols are defined. Usage example::
+
+ #ifdef __SEG_GS
+
+ long data0 = 0;
+ long data1 = 1;
+
+ long __seg_gs *ptr;
+
+ /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */
+ ....
+
+ /* Set GS base to point to data0 */
+ _writegsbase_u64(&data0);
+
+ /* Access offset 0 of GS */
+ ptr = 0;
+ printf("data0 = %ld\n", *ptr);
+
+ /* Set GS base to point to data1 */
+ _writegsbase_u64(&data1);
+ /* ptr still addresses offset 0! */
+ printf("data1 = %ld\n", *ptr);
+
+
+Clang does not provide the GCC address space identifiers, but it provides
+address spaces via an attribute based mechanism in Clang 2.6 and newer
+versions:
+
+ ==================================== =====================================
+ __attribute__((address_space(256))) Variable is addressed relative to GS
+ __attribute__((address_space(257))) Variable is addressed relative to FS
+ ==================================== =====================================
+
+FS/GS based addressing with inline assembly
+-------------------------------------------
+
+In case the compiler does not support address spaces, inline assembly can
+be used for FS/GS based addressing mode::
+
+ mov %fs:offset, %reg
+ mov %gs:offset, %reg
+
+ mov %reg, %fs:offset
+ mov %reg, %gs:offset
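For illustration only (not part of the patch): a minimal user-space sketch that combines the detection and addressing snippets from the document above. It assumes x86-64 Linux, GCC 6 or newer (for __SEG_GS) and compilation with -mfsgsbase; the HWCAP2_FSGSBASE fallback definition mirrors the one in the document.

	#include <elf.h>
	#include <stdio.h>
	#include <sys/auxv.h>
	#include <immintrin.h>		/* _writegsbase_u64(); build with -mfsgsbase */

	#ifndef HWCAP2_FSGSBASE
	#define HWCAP2_FSGSBASE (1 << 1)
	#endif

	int main(void)
	{
	#ifdef __SEG_GS
		long data0 = 0, data1 = 1;
		long __seg_gs *ptr = 0;		/* offset 0 relative to the GS base */

		/* Only use WRGSBASE when the kernel has enabled FSGSBASE. */
		if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
			fprintf(stderr, "FSGSBASE not enabled by the kernel\n");
			return 1;
		}

		_writegsbase_u64((unsigned long long)&data0);
		printf("data0 = %ld\n", *ptr);	/* prints 0 */

		_writegsbase_u64((unsigned long long)&data1);
		printf("data1 = %ld\n", *ptr);	/* same offset, prints 1 */
	#endif
		return 0;
	}
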
diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst
index d6eaaa5a35fc..a56070fc8e77 100644
--- a/Documentation/x86/x86_64/index.rst
+++ b/Documentation/x86/x86_64/index.rst
@@ -14,3 +14,4 @@ x86_64 Support
fake-numa-for-cpusets
cpu-hotplug-spec
machinecheck
+ fsgs
diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c
index 746e1fce777e..9a9aa53547a6 100644
--- a/arch/arm/common/bL_switcher.c
+++ b/arch/arm/common/bL_switcher.c
@@ -270,12 +270,11 @@ static struct bL_thread bL_threads[NR_CPUS];
static int bL_switcher_thread(void *arg)
{
struct bL_thread *t = arg;
- struct sched_param param = { .sched_priority = 1 };
int cluster;
bL_switch_completion_handler completer;
void *completer_cookie;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+ sched_set_fifo_low(current);
complete(&t->started);
do {
diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h
index 84dc0ba822f5..5dcf3c6011b7 100644
--- a/arch/arm/include/asm/efi.h
+++ b/arch/arm/include/asm/efi.h
@@ -87,4 +87,11 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base,
return dram_base + SZ_512M;
}
+struct efi_arm_entry_state {
+ u32 cpsr_before_ebs;
+ u32 sctlr_before_ebs;
+ u32 cpsr_after_ebs;
+ u32 sctlr_after_ebs;
+};
+
#endif /* _ASM_ARM_EFI_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6a0cc524882d..0fe0446102a7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -67,7 +67,7 @@ config X86
select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_KCOV if X86_64
+ select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
@@ -1292,7 +1292,6 @@ config MICROCODE
bool "CPU microcode loading support"
default y
depends on CPU_SUP_AMD || CPU_SUP_INTEL
- select FW_LOADER
help
If you say Y here, you will be able to update the microcode on
Intel and AMD processors. The Intel support is for the IA32 family,
@@ -1314,7 +1313,6 @@ config MICROCODE_INTEL
bool "Intel microcode loading support"
depends on MICROCODE
default MICROCODE
- select FW_LOADER
help
This options enables microcode patch loading support for Intel
processors.
@@ -1326,7 +1324,6 @@ config MICROCODE_INTEL
config MICROCODE_AMD
bool "AMD microcode loading support"
depends on MICROCODE
- select FW_LOADER
help
If you select this option, microcode patch loading support for AMD
processors will be enabled.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index e821a7d7d5c4..97d37f0a34f5 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -213,7 +213,6 @@ SYM_FUNC_START(startup_32)
* We place all of the values on our mini stack so lret can
* used to perform that far jump.
*/
- pushl $__KERNEL_CS
leal startup_64(%ebp), %eax
#ifdef CONFIG_EFI_MIXED
movl efi32_boot_args(%ebp), %edi
@@ -224,11 +223,20 @@ SYM_FUNC_START(startup_32)
movl efi32_boot_args+8(%ebp), %edx // saved bootparams pointer
cmpl $0, %edx
jnz 1f
+ /*
+ * efi_pe_entry uses MS calling convention, which requires 32 bytes of
+ * shadow space on the stack even if all arguments are passed in
+ * registers. We also need an additional 8 bytes for the space that
+ * would be occupied by the return address, and this also results in
+ * the correct stack alignment for entry.
+ */
+ subl $40, %esp
leal efi_pe_entry(%ebp), %eax
movl %edi, %ecx // MS calling convention
movl %esi, %edx
1:
#endif
+ pushl $__KERNEL_CS
pushl %eax
/* Enter paged protected Mode, activating Long Mode */
@@ -784,6 +792,7 @@ SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0)
SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0
+ .balign 16
SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end)
/*
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 4208c1e3f601..98e4d8886f11 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/inst.h>
/*
@@ -341,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+ rdgsbase \save_reg
+ GET_PERCPU_BASE \scratch_reg
+ wrgsbase \scratch_reg
+.endm
+
#else /* CONFIG_X86_64 */
# undef UNWIND_HINT_IRET_REGS
# define UNWIND_HINT_IRET_REGS
@@ -351,3 +358,36 @@ For 32-bit we have the following conventions - kernel is built with
call stackleak_erase
#endif
.endm
+
+#ifdef CONFIG_SMP
+
+/*
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
+ */
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+ movq $__CPUNODE_SEG, \reg
+ lsl \reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ */
+.macro GET_PERCPU_BASE reg:req
+ ALTERNATIVE \
+ "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
+ "RDPID \reg", \
+ X86_FEATURE_RDPID
+ andq $VDSO_CPUNODE_MASK, \reg
+ movq __per_cpu_offset(, \reg, 8), \reg
+.endm
+
+#else
+
+.macro GET_PERCPU_BASE reg:req
+ movq pcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */
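As an aside (not part of the patch): the LOAD_CPU_AND_NODE_SEG_LIMIT trick above can also be exercised from user space with LSL on the CPUNODE segment selector, much like the vDSO getcpu() fast path. The selector value and the bit layout below are mirrored from arch/x86/include/asm/segment.h and are assumptions of this sketch.

	#include <stdio.h>

	/* Values mirrored from arch/x86/include/asm/segment.h (assumed here). */
	#define GDT_ENTRY_CPUNODE	15
	#define __CPUNODE_SEG		(GDT_ENTRY_CPUNODE * 8 + 3)
	#define VDSO_CPUNODE_BITS	12
	#define VDSO_CPUNODE_MASK	((1U << VDSO_CPUNODE_BITS) - 1)

	int main(void)
	{
		unsigned int p = 0;

		/* LSL returns the segment limit, which encodes the CPU/node number. */
		asm("lsl %1, %0" : "=r" (p) : "r" ((unsigned int)__CPUNODE_SEG));

		printf("cpu %u, node %u\n",
		       p & VDSO_CPUNODE_MASK, p >> VDSO_CPUNODE_BITS);
		return 0;
	}
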
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d2a00c97e53f..fb729f4c4fbc 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,6 +38,7 @@
#include <asm/frame.h>
#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
+#include <asm/fsgsbase.h>
#include <linux/err.h>
#include "calling.h"
@@ -426,10 +427,7 @@ SYM_CODE_START(\asmsym)
testb $3, CS-ORIG_RAX(%rsp)
jnz .Lfrom_usermode_switch_stack_\@
- /*
- * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
- * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
- */
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
call paranoid_entry
UNWIND_HINT_REGS
@@ -458,10 +456,7 @@ SYM_CODE_START(\asmsym)
UNWIND_HINT_IRET_REGS offset=8
ASM_CLAC
- /*
- * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
- * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
- */
+ /* paranoid_entry returns GS information for paranoid_exit in EBX. */
call paranoid_entry
UNWIND_HINT_REGS
@@ -798,24 +793,21 @@ SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */
/*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
+ *
+ * Y GSBASE value at entry, must be restored in paranoid_exit
*/
SYM_CODE_START_LOCAL(paranoid_entry)
UNWIND_HINT_FUNC
cld
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
- movl $1, %ebx
- movl $MSR_GS_BASE, %ecx
- rdmsr
- testl %edx, %edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx, %ebx
-1:
/*
* Always stash CR3 in %r14. This value will be restored,
* verbatim, at exit. Needed if paranoid_entry interrupted
@@ -825,16 +817,60 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* This is also why CS (stashed in the "iret frame" by the
* hardware at entry) can not be used: this may be a return
* to kernel code, but with a user CR3 value.
+ *
+ * Switching CR3 does not depend on kernel GSBASE so it can
+ * be done before switching to the kernel GSBASE. This is
+ * required for FSGSBASE because the kernel GSBASE has to
+ * be retrieved from a kernel internal table.
*/
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
/*
+ * Handling GSBASE depends on the availability of FSGSBASE.
+ *
+ * Without FSGSBASE the kernel enforces that negative GSBASE
+ * values indicate kernel GSBASE. With FSGSBASE no assumptions
+ * can be made about the GSBASE value when entering from user
+ * space.
+ */
+ ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /*
+ * Read the current GSBASE and store it in %rbx unconditionally,
+ * retrieve and set the current CPUs kernel GSBASE. The stored value
+ * has to be restored in paranoid_exit unconditionally.
+ *
+ * The MSR write ensures that no subsequent load is based on a
+ * mispredicted GSBASE. No extra FENCE required.
+ */
+ SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+ ret
+
+.Lparanoid_entry_checkgs:
+ /* EBX = 1 -> kernel GSBASE active, no restore required */
+ movl $1, %ebx
+ /*
+ * The kernel-enforced convention is a negative GSBASE indicates
+ * a kernel value. No SWAPGS needed on entry and exit.
+ */
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ jns .Lparanoid_entry_swapgs
+ ret
+
+.Lparanoid_entry_swapgs:
+ SWAPGS
+
+ /*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
* unconditional CR3 write, even in the PTI case. So do an lfence
* to prevent GS speculation, regardless of whether PTI is enabled.
*/
FENCE_SWAPGS_KERNEL_ENTRY
+ /* EBX = 0 -> SWAPGS required on exit */
+ xorl %ebx, %ebx
ret
SYM_CODE_END(paranoid_entry)
@@ -845,23 +881,45 @@ SYM_CODE_END(paranoid_entry)
*
* We may be returning to very strange contexts (e.g. very early
* in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
+ * be complicated. Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * R/EBX contains the GSBASE related information depending on the
+ * availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE R/EBX
+ * N 0 -> SWAPGS on exit
+ * 1 -> no SWAPGS on exit
*
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * Y User space GSBASE, must be restored unconditionally
*/
SYM_CODE_START_LOCAL(paranoid_exit)
UNWIND_HINT_REGS
- testl %ebx, %ebx /* swapgs needed? */
- jnz .Lparanoid_exit_no_swapgs
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
+ /*
+ * The order of operations is important. RESTORE_CR3 requires
+ * kernel GSBASE.
+ *
+ * NB to anyone to try to optimize this code: this code does
+ * not execute at all for exceptions from user mode. Those
+ * exceptions go through error_exit instead.
+ */
+ RESTORE_CR3 scratch_reg=%rax save_reg=%r14
+
+ /* Handle the three GSBASE cases */
+ ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
+
+ /* With FSGSBASE enabled, unconditionally restore GSBASE */
+ wrgsbase %rbx
+ jmp restore_regs_and_return_to_kernel
+
+.Lparanoid_exit_checkgs:
+ /* On non-FSGSBASE systems, conditionally do SWAPGS */
+ testl %ebx, %ebx
+ jnz restore_regs_and_return_to_kernel
+
+ /* We are returning to a context with user GSBASE */
SWAPGS_UNSAFE_STACK
- jmp restore_regs_and_return_to_kernel
-.Lparanoid_exit_no_swapgs:
- /* Always restore stashed CR3 value (see paranoid_entry) */
- RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
- jmp restore_regs_and_return_to_kernel
+ jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
/*
@@ -1266,10 +1324,27 @@ end_repeat_nmi:
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
- testl %ebx, %ebx /* swapgs needed? */
+ /*
+ * The above invocation of paranoid_entry stored the GSBASE
+ * related information in R/EBX depending on the availability
+ * of FSGSBASE.
+ *
+ * If FSGSBASE is enabled, restore the saved GSBASE value
+ * unconditionally, otherwise take the conditional SWAPGS path.
+ */
+ ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
+
+ wrgsbase %rbx
+ jmp nmi_restore
+
+nmi_no_fsgsbase:
+ /* EBX == 0 -> invoke SWAPGS */
+ testl %ebx, %ebx
jnz nmi_restore
+
nmi_swapgs:
SWAPGS_UNSAFE_STACK
+
nmi_restore:
POP_REGS
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 12c42eba77ec..9933c0e8e97a 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y += core.o probe.o
-obj-$(PERF_EVENTS_INTEL_RAPL) += rapl.o
+obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o
obj-y += amd/
obj-$(CONFIG_X86_LOCAL_APIC) += msr.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel/
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
index 43b09e9c93a2..16a2369c586e 100644
--- a/arch/x86/events/amd/power.c
+++ b/arch/x86/events/amd/power.c
@@ -13,10 +13,6 @@
#include <asm/cpu_device_id.h>
#include "../perf_event.h"
-#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
-#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
-#define MSR_F15H_PTSC 0xc0010280
-
/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */
#define AMD_POWER_EVENT_MASK 0xFFULL
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index cf76d6631afa..d5c6d3b340c5 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver;
DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
struct pci_extra_dev *uncore_extra_pci_dev;
-static int max_dies;
+int __uncore_max_dies;
/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;
@@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu
* The unsigned check also catches the '-1' return value for non
* existent mappings in the topology map.
*/
- return dieid < max_dies ? pmu->boxes[dieid] : NULL;
+ return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL;
}
u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
@@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box,
if (!box->io_addr)
return 0;
+ if (!uncore_mmio_is_valid_offset(box, event->hw.event_base))
+ return 0;
+
return readq(box->io_addr + event->hw.event_base);
}
@@ -843,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
.read = uncore_pmu_event_read,
.module = THIS_MODULE,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .attr_update = pmu->type->attr_update,
};
} else {
pmu->pmu = *pmu->type->pmu;
pmu->pmu.attr_groups = pmu->type->attr_groups;
+ pmu->pmu.attr_update = pmu->type->attr_update;
}
if (pmu->type->num_boxes == 1) {
@@ -877,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
{
int die;
- for (die = 0; die < max_dies; die++)
+ for (die = 0; die < uncore_max_dies(); die++)
kfree(pmu->boxes[die]);
kfree(pmu->boxes);
}
@@ -887,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type)
struct intel_uncore_pmu *pmu = type->pmus;
int i;
+ if (type->cleanup_mapping)
+ type->cleanup_mapping(type);
+
if (pmu) {
for (i = 0; i < type->num_boxes; i++, pmu++) {
uncore_pmu_unregister(pmu);
@@ -915,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
if (!pmus)
return -ENOMEM;
- size = max_dies * sizeof(struct intel_uncore_box *);
+ size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
for (i = 0; i < type->num_boxes; i++) {
pmus[i].func_id = setid ? i : -1;
@@ -954,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
type->pmu_group = &uncore_pmu_attr_group;
+ if (type->set_mapping)
+ type->set_mapping(type);
+
return 0;
err:
@@ -1112,7 +1123,7 @@ static int __init uncore_pci_init(void)
size_t size;
int ret;
- size = max_dies * sizeof(struct pci_extra_dev);
+ size = uncore_max_dies() * sizeof(struct pci_extra_dev);
uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
if (!uncore_extra_pci_dev) {
ret = -ENOMEM;
@@ -1514,6 +1525,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init),
@@ -1539,7 +1552,8 @@ static int __init intel_uncore_init(void)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return -ENODEV;
- max_dies = topology_max_packages() * topology_max_die_per_package();
+ __uncore_max_dies =
+ topology_max_packages() * topology_max_die_per_package();
uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
if (uncore_init->pci_init) {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index b469ddd45515..105fdc69825e 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -61,6 +61,7 @@ struct intel_uncore_type {
unsigned msr_offset;
unsigned mmio_offset;
};
+ unsigned mmio_map_size;
unsigned num_shared_regs:8;
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
@@ -72,7 +73,19 @@ struct intel_uncore_type {
struct uncore_event_desc *event_descs;
struct freerunning_counters *freerunning;
const struct attribute_group *attr_groups[4];
+ const struct attribute_group **attr_update;
struct pmu *pmu; /* for custom pmu ops */
+ /*
+ * Uncore PMU would store relevant platform topology configuration here
+ * to identify which platform component each PMON block of that type is
+ * supposed to monitor.
+ */
+ u64 *topology;
+ /*
+ * Optional callbacks for managing mapping of Uncore units to PMONs
+ */
+ int (*set_mapping)(struct intel_uncore_type *type);
+ void (*cleanup_mapping)(struct intel_uncore_type *type);
};
#define pmu_group attr_groups[0]
@@ -169,6 +182,18 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
ssize_t uncore_event_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
+static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
+{
+ return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+}
+
+#define to_device_attribute(n) container_of(n, struct device_attribute, attr)
+#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr)
+#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n))
+
+extern int __uncore_max_dies;
+#define uncore_max_dies() (__uncore_max_dies)
+
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
@@ -196,6 +221,18 @@ static inline bool uncore_pmc_freerunning(int idx)
return idx == UNCORE_PMC_IDX_FREERUNNING;
}
+static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box,
+ unsigned long offset)
+{
+ if (offset < box->pmu->type->mmio_map_size)
+ return true;
+
+ pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n",
+ offset, box->pmu->type->name);
+
+ return false;
+}
+
static inline
unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box)
{
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 3de1065eefc4..cb94ba86efd2 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -42,6 +42,17 @@
#define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0
#define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34
#define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35
+#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44
+#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54
+#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64
+#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51
+#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61
+#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71
+#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33
+#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43
+#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53
+#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63
+#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73
#define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02
#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12
#define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02
@@ -415,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = {
static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
{
+ struct intel_uncore_type *type = box->pmu->type;
struct pci_dev *pdev = box->pci_dev;
int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
resource_size_t addr;
@@ -430,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
addr &= ~(PAGE_SIZE - 1);
- box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr)
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
}
@@ -586,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = {
.num_counters = 2,
.num_boxes = 1,
.num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE,
.freerunning = snb_uncore_imc_freerunning,
.event_descs = snb_uncore_imc_events,
.format_group = &snb_uncore_imc_format_group,
@@ -771,6 +787,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ },
};
@@ -863,6 +923,17 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */
IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */
IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */
+ IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver),
+ IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */
{ /* end marker */ }
@@ -1085,11 +1156,13 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
}
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
+#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
+ struct intel_uncore_type *type = pmu->type;
resource_size_t addr;
u32 mch_bar;
@@ -1112,7 +1185,9 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
addr |= ((resource_size_t)mch_bar << 32);
#endif
- box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr)
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
}
static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
@@ -1138,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = {
.num_counters = 3,
.num_boxes = 2,
.num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE,
.freerunning = tgl_uncore_imc_freerunning,
.ops = &tgl_uncore_imc_freerunning_ops,
.event_descs = tgl_uncore_imc_events,
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 07652fa20ebb..62e88ad919ff 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -273,6 +273,30 @@
#define SKX_CPUNODEID 0xc0
#define SKX_GIDNIDMAP 0xd4
+/*
+ * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR
+ * that BIOS programmed. MSR has package scope.
+ * | Bit | Default | Description
+ * | [63] | 00h | VALID - When set, indicates the CPU bus
+ * numbers have been initialized. (RO)
+ * |[62:48]| --- | Reserved
+ * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned
+ * CPUBUSNO(5). (RO)
+ * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned
+ * CPUBUSNO(4). (RO)
+ * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned
+ * CPUBUSNO(3). (RO)
+ * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned
+ * CPUBUSNO(2). (RO)
+ * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned
+ * CPUBUSNO(1). (RO)
+ * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned
+ * CPUBUSNO(0). (RO)
+ */
+#define SKX_MSR_CPU_BUS_NUMBER 0x300
+#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63)
+#define BUS_NUM_STRIDE 8
+
/* SKX CHA */
#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
@@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
.read_counter = uncore_msr_read_counter,
};
+static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
+{
+ return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+}
+
+static umode_t
+skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+ /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */
+ return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode;
+}
+
+static ssize_t skx_iio_mapping_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct pci_bus *bus = pci_find_next_bus(NULL);
+ struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+ struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+ long die = (long)ea->var;
+
+ /*
+ * Current implementation is for single segment configuration hence it's
+ * safe to take the segment value from the first available root bus.
+ */
+ return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus),
+ skx_iio_stack(uncore_pmu, die));
+}
+
+static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
+{
+ u64 msr_value;
+
+ if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) ||
+ !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT))
+ return -ENXIO;
+
+ *topology = msr_value;
+
+ return 0;
+}
+
+static int die_to_cpu(int die)
+{
+ int res = 0, cpu, current_die;
+ /*
+ * Using cpus_read_lock() to ensure cpu is not going down between
+ * looking at cpu_online_mask.
+ */
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ current_die = topology_logical_die_id(cpu);
+ if (current_die == die) {
+ res = cpu;
+ break;
+ }
+ }
+ cpus_read_unlock();
+ return res;
+}
+
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+ int i, ret;
+ struct pci_bus *bus = NULL;
+
+ /*
+ * Verified single-segment environments only; disabled for multiple
+ * segment topologies for now except VMD domains.
+ * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
+ */
+ while ((bus = pci_find_next_bus(bus))
+ && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff))
+ ;
+ if (bus)
+ return -EPERM;
+
+ type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL);
+ if (!type->topology)
+ return -ENOMEM;
+
+ for (i = 0; i < uncore_max_dies(); i++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]);
+ if (ret) {
+ kfree(type->topology);
+ type->topology = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static struct attribute_group skx_iio_mapping_group = {
+ .is_visible = skx_iio_mapping_visible,
+};
+
+static const struct attribute_group *skx_iio_attr_update[] = {
+ &skx_iio_mapping_group,
+ NULL,
+};
+
+static int skx_iio_set_mapping(struct intel_uncore_type *type)
+{
+ char buf[64];
+ int ret;
+ long die = -1;
+ struct attribute **attrs = NULL;
+ struct dev_ext_attribute *eas = NULL;
+
+ ret = skx_iio_get_topology(type);
+ if (ret)
+ return ret;
+
+ /* One more for NULL. */
+ attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ goto err;
+
+ eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL);
+ if (!eas)
+ goto err;
+
+ for (die = 0; die < uncore_max_dies(); die++) {
+ sprintf(buf, "die%ld", die);
+ sysfs_attr_init(&eas[die].attr.attr);
+ eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
+ if (!eas[die].attr.attr.name)
+ goto err;
+ eas[die].attr.attr.mode = 0444;
+ eas[die].attr.show = skx_iio_mapping_show;
+ eas[die].attr.store = NULL;
+ eas[die].var = (void *)die;
+ attrs[die] = &eas[die].attr.attr;
+ }
+ skx_iio_mapping_group.attrs = attrs;
+
+ return 0;
+err:
+ for (; die >= 0; die--)
+ kfree(eas[die].attr.attr.name);
+ kfree(eas);
+ kfree(attrs);
+ kfree(type->topology);
+ type->attr_update = NULL;
+ return -ENOMEM;
+}
+
+static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+ struct attribute **attr = skx_iio_mapping_group.attrs;
+
+ if (!attr)
+ return;
+
+ for (; *attr; attr++)
+ kfree((*attr)->name);
+ kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs));
+ kfree(skx_iio_mapping_group.attrs);
+ skx_iio_mapping_group.attrs = NULL;
+ kfree(type->topology);
+}
+
static struct intel_uncore_type skx_uncore_iio = {
.name = "iio",
.num_counters = 4,
@@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = {
.constraints = skx_uncore_iio_constraints,
.ops = &skx_uncore_iio_ops,
.format_group = &skx_uncore_iio_format_group,
+ .attr_update = skx_iio_attr_update,
+ .set_mapping = skx_iio_set_mapping,
+ .cleanup_mapping = skx_iio_cleanup_mapping,
};
enum perf_uncore_iio_freerunning_type_id {
@@ -4421,6 +4612,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
unsigned int box_ctl, int mem_offset)
{
struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid);
+ struct intel_uncore_type *type = box->pmu->type;
resource_size_t addr;
u32 pci_dword;
@@ -4435,9 +4627,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
addr += box_ctl;
- box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE);
- if (!box->io_addr)
+ box->io_addr = ioremap(addr, type->mmio_map_size);
+ if (!box->io_addr) {
+ pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
return;
+ }
writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr);
}
@@ -4480,6 +4674,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box,
if (!box->io_addr)
return;
+ if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+ return;
+
writel(hwc->config | SNBEP_PMON_CTL_EN,
box->io_addr + hwc->config_base);
}
@@ -4492,6 +4689,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box,
if (!box->io_addr)
return;
+ if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+ return;
+
writel(hwc->config, box->io_addr + hwc->config_base);
}
@@ -4530,6 +4730,7 @@ static struct intel_uncore_type snr_uncore_imc = {
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
.mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.ops = &snr_uncore_mmio_ops,
.format_group = &skx_uncore_format_group,
};
@@ -4570,6 +4771,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = {
.num_counters = 3,
.num_boxes = 1,
.num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.freerunning = snr_imc_freerunning,
.ops = &snr_uncore_imc_freerunning_ops,
.event_descs = snr_uncore_imc_freerunning_events,
@@ -4987,6 +5189,7 @@ static struct intel_uncore_type icx_uncore_imc = {
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
.box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL,
.mmio_offset = SNR_IMC_MMIO_OFFSET,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.ops = &icx_uncore_mmio_ops,
.format_group = &skx_uncore_format_group,
};
@@ -5044,6 +5247,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = {
.num_counters = 5,
.num_boxes = 4,
.num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = SNR_IMC_MMIO_SIZE,
.freerunning = icx_imc_freerunning,
.ops = &icx_uncore_imc_freerunning_ops,
.event_descs = icx_uncore_imc_freerunning_events,
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 0f63585edf5f..5c15f95b1ba7 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -144,7 +144,7 @@
_ASM_PTR (entry); \
.popsection
-#else
+#else /* ! __ASSEMBLY__ */
# define _EXPAND_EXTABLE_HANDLE(x) #x
# define _ASM_EXTABLE_HANDLE(from, to, handler) \
" .pushsection \"__ex_table\",\"a\"\n" \
@@ -164,9 +164,7 @@
_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
/* For C file, we already have NOKPROBE_SYMBOL macro */
-#endif
-#ifndef __ASSEMBLY__
/*
* This output constraint should be used for any inline asm which has a "call"
* instruction. Otherwise the asm may be inserted before the frame pointer
@@ -175,6 +173,6 @@
*/
register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
-#endif
+#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 35460fef39b8..0367efdc5b7a 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -201,12 +201,8 @@ arch_test_and_change_bit(long nr, volatile unsigned long *addr)
return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr);
}
-static __no_kcsan_or_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
+static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr)
{
- /*
- * Because this is a plain access, we need to disable KCSAN here to
- * avoid double instrumentation via instrumented bitops.
- */
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index fb34ff641e0a..028189575560 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -75,6 +75,12 @@ do { \
unreachable(); \
} while (0)
+/*
+ * This instrumentation_begin() is strictly speaking incorrect; but it
+ * suppresses the complaints from WARN()s in noinstr code. If such a WARN()
+ * were to trigger, we'd rather wreck the machine in an attempt to get the
+ * message out than not know about it.
+ */
#define __WARN_FLAGS(flags) \
do { \
instrumentation_begin(); \
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index dd17c2da1af5..da78ccbd493b 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -58,4 +58,9 @@ static inline bool handle_guest_split_lock(unsigned long ip)
return false;
}
#endif
+#ifdef CONFIG_IA32_FEAT_CTL
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
+#else
+static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {}
+#endif
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 02dabc9e77b0..c693ebf32a15 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -96,6 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
+/* free ( 3*32+17) */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
@@ -107,6 +108,7 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+/* free ( 3*32+29) */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index 6722ffcef2e6..3afa990d756b 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -11,5 +11,23 @@ extern cpumask_var_t cpu_sibling_setup_mask;
extern void setup_cpu_local_masks(void);
+/*
+ * NMI and MCE exceptions need cpu_is_offline() _really_ early,
+ * provide an arch_ special for them to avoid instrumentation.
+ */
+#if NR_CPUS > 1
+static __always_inline bool arch_cpu_online(int cpu)
+{
+ return arch_test_bit(cpu, cpumask_bits(cpu_online_mask));
+}
+#else
+static __always_inline bool arch_cpu_online(int cpu)
+{
+ return cpu == 0;
+}
+#endif
+
+#define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu))
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_CPUMASK_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9b8cb50768c2..b8f1dc0761e4 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
#else
# include <asm-generic/div64.h>
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
{
u64 q;
asm ("mulq %2; divq %3" : "=a" (q)
- : "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+ : "a" (a), "rm" (mul), "rm" (div)
: "rdx");
return q;
}
+#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+ return mul_u64_u64_div_u64(a, mul, div);
+}
#define mul_u64_u32_div mul_u64_u32_div
#endif /* CONFIG_X86_32 */
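As a reference point (not part of the patch): the mulq/divq sequence above computes the same result as this portable sketch using a 128-bit intermediate. Like the asm version, it has no overflow handling, so a quotient that does not fit in 64 bits (or a zero divisor) is a fault there and undefined behaviour here. Assumes a compiler that provides unsigned __int128.

	#include <stdint.h>

	/* Portable illustration of mul_u64_u64_div_u64(): a * mul / div with a
	 * 128-bit intermediate product. */
	static inline uint64_t mul_u64_u64_div_u64_ref(uint64_t a, uint64_t mul, uint64_t div)
	{
		return (uint64_t)(((unsigned __int128)a * mul) / div);
	}
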
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index bca4c743de77..aefd53767a5d 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -19,36 +19,63 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task);
extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
-/* Helper functions for reading/writing FS/GS base */
+/* Must be protected by X86_FEATURE_FSGSBASE check. */
-static inline unsigned long x86_fsbase_read_cpu(void)
+static __always_inline unsigned long rdfsbase(void)
{
unsigned long fsbase;
- rdmsrl(MSR_FS_BASE, fsbase);
+ asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
return fsbase;
}
-static inline unsigned long x86_gsbase_read_cpu_inactive(void)
+static __always_inline unsigned long rdgsbase(void)
{
unsigned long gsbase;
- rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
return gsbase;
}
-static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+static __always_inline void wrfsbase(unsigned long fsbase)
{
- wrmsrl(MSR_FS_BASE, fsbase);
+ asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
}
-static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+static __always_inline void wrgsbase(unsigned long gsbase)
{
- wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
}
+#include <asm/cpufeature.h>
+
+/* Helper functions for reading/writing FS/GS base */
+
+static inline unsigned long x86_fsbase_read_cpu(void)
+{
+ unsigned long fsbase;
+
+ if (static_cpu_has(X86_FEATURE_FSGSBASE))
+ fsbase = rdfsbase();
+ else
+ rdmsrl(MSR_FS_BASE, fsbase);
+
+ return fsbase;
+}
+
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+ if (static_cpu_has(X86_FEATURE_FSGSBASE))
+ wrfsbase(fsbase);
+ else
+ wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+extern unsigned long x86_gsbase_read_cpu_inactive(void);
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+
#endif /* CONFIG_X86_64 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index f5a796da07f8..d063841a17e3 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -306,6 +306,21 @@
.endif
MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
.endm
+
+.macro RDPID opd
+ REG_TYPE rdpid_opd_type \opd
+ .if rdpid_opd_type == REG_TYPE_R64
+ R64_NUM rdpid_opd \opd
+ .else
+ R32_NUM rdpid_opd \opd
+ .endif
+ .byte 0xf3
+ .if rdpid_opd > 7
+ PFX_REX rdpid_opd 0
+ .endif
+ .byte 0x0f, 0xc7
+ MODRM 0xc0 rdpid_opd 0x7
+.endm
#endif
#endif
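The RDPID macro above hand-assembles the F3 0F C7 /7 encoding for toolchains whose assembler does not know the mnemonic. A hypothetical user-space sketch using the same byte sequence (rdpid %rax is f3 0f c7 f8); a real program would first check CPUID.7.0:ECX bit 22, since the instruction raises #UD otherwise, and the cpu/node split of IA32_TSC_AUX shown here is the layout Linux programs for RDTSCP/getcpu:

#include <stdint.h>
#include <stdio.h>

static inline uint64_t rdpid(void)
{
	uint64_t aux;

	/* f3 0f c7 /7, modrm = 0xc0 | (7 << 3) | 0 (rax) = 0xf8 */
	asm volatile(".byte 0xf3, 0x0f, 0xc7, 0xf8" : "=a" (aux));
	return aux;
}

int main(void)
{
	uint64_t aux = rdpid();

	printf("cpu=%llu node=%llu\n",
	       (unsigned long long)(aux & 0xfff),
	       (unsigned long long)(aux >> 12));
	return 0;
}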
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 073eb7ad2f56..143bc9abe99c 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -66,6 +66,8 @@ struct arch_specific_insn {
*/
bool boostable;
bool if_modifier;
+ /* Number of bytes of text poked */
+ int tp_len;
};
struct arch_optimized_insn {
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 848ce43b9040..5049f6c22683 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -43,9 +43,10 @@ void __init sme_enable(struct boot_params *bp);
int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+void __init mem_encrypt_free_decrypted_mem(void);
+
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
-void __init mem_encrypt_free_decrypted_mem(void);
bool sme_active(void);
bool sev_active(void);
@@ -77,6 +78,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline void mem_encrypt_free_decrypted_mem(void) { }
+
#define __bss_decrypted
#endif /* CONFIG_AMD_MEM_ENCRYPT */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e8370e64a155..63ed8fe35738 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -418,7 +418,6 @@
#define MSR_AMD64_PATCH_LEVEL 0x0000008b
#define MSR_AMD64_TSC_RATIO 0xc0000104
#define MSR_AMD64_NB_CFG 0xc001001f
-#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_PATCH_LOADER 0xc0010020
#define MSR_AMD_PERF_CTL 0xc0010062
#define MSR_AMD_PERF_STATUS 0xc0010063
@@ -427,6 +426,7 @@
#define MSR_AMD64_OSVW_STATUS 0xc0010141
#define MSR_AMD_PPIN_CTL 0xc00102f0
#define MSR_AMD_PPIN 0xc00102f1
+#define MSR_AMD64_CPUID_FN_1 0xc0011004
#define MSR_AMD64_LS_CFG 0xc0011020
#define MSR_AMD64_DC_CFG 0xc0011022
#define MSR_AMD64_BU_CFG2 0xc001102a
@@ -466,6 +466,8 @@
#define MSR_F16H_DR0_ADDR_MASK 0xc0011027
/* Fam 15h MSRs */
+#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
+#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
#define MSR_F15H_PERF_CTL 0xc0010200
#define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL
#define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 76aa21e8128d..b836138ce852 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -999,15 +999,12 @@ extern int direct_gbpages;
void init_mem_mapping(void);
void early_alloc_pgt_buf(void);
extern void memblock_find_dma_reserve(void);
-
-
-#ifdef CONFIG_X86_64
-extern pgd_t trampoline_pgd_entry;
-
void __init poking_init(void);
-
unsigned long init_memory_mapping(unsigned long start,
unsigned long end, pgprot_t prot);
+
+#ifdef CONFIG_X86_64
+extern pgd_t trampoline_pgd_entry;
#endif
/* local pte updates need not use xchg for locking */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 42cd333616c4..7c2ecdf7e064 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -457,10 +457,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);
-#if IS_ENABLED(CONFIG_KVM)
/* Save actual FS/GS selectors and bases to current->thread */
-void save_fsgs_for_kvm(void);
-#endif
+void current_save_fsgs(void);
#else /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
/*
@@ -575,7 +573,7 @@ native_load_sp0(unsigned long sp0)
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
-static inline void native_swapgs(void)
+static __always_inline void native_swapgs(void)
{
#ifdef CONFIG_X86_64
asm volatile("swapgs" ::: "memory");
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 9804a7957f4e..7fb482f0f25b 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -90,6 +90,15 @@ static __always_inline void boot_init_stack_canary(void)
#endif
}
+static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
+{
+#ifdef CONFIG_X86_64
+ per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary;
+#else
+ per_cpu(stack_canary.canary, cpu) = idle->stack_canary;
+#endif
+}
+
static inline void setup_stack_canary_segment(int cpu)
{
#ifdef CONFIG_X86_32
@@ -119,6 +128,9 @@ static inline void load_stack_canary_segment(void)
static inline void setup_stack_canary_segment(int cpu)
{ }
+static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
+{ }
+
static inline void load_stack_canary_segment(void)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 79d8d5496330..f4234575f3fd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -193,7 +193,7 @@ static inline void sched_clear_itmt_support(void)
}
#endif /* CONFIG_SCHED_MC_PRIO */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
#include <asm/cpufeature.h>
DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
index 8b2effe6efb8..5fdfcb47000f 100644
--- a/arch/x86/include/uapi/asm/hwcap2.h
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -5,4 +5,7 @@
/* MONITOR/MWAIT enabled in Ring 3 */
#define HWCAP2_RING3MWAIT (1 << 0)
+/* Kernel allows the FSGSBASE instructions to be used in Ring 3 */
+#define HWCAP2_FSGSBASE (1 << 1)
+
#endif
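Together with the identify_cpu() hunk further down, this bit tells user space that CR4.FSGSBASE is set and the instructions may be used directly. A hedged user-space sketch of the intended detection pattern; getauxval()/AT_HWCAP2 come from glibc, the local define mirrors HWCAP2_FSGSBASE above, and the fallback path is only indicated:

#include <stdint.h>
#include <stdio.h>
#include <sys/auxv.h>

#define HWCAP2_FSGSBASE (1 << 1)	/* mirrors the uapi definition above */

static inline uint64_t rdfsbase(void)
{
	uint64_t base;

	asm volatile("rdfsbase %0" : "=r" (base));
	return base;
}

int main(void)
{
	if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
		puts("FSGSBASE not enabled; fall back to arch_prctl(ARCH_GET_FS)");
		return 0;
	}

	printf("FS base: %#llx\n", (unsigned long long)rdfsbase());
	return 0;
}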
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8fd39ff74a49..20e07feb4064 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -3,6 +3,7 @@
#include <linux/module.h>
#include <linux/sched.h>
+#include <linux/perf_event.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
@@ -53,7 +54,7 @@ __setup("noreplace-smp", setup_noreplace_smp);
#define DPRINTK(fmt, args...) \
do { \
if (debug_alternative) \
- printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \
} while (0)
#define DUMP_BYTES(buf, len, fmt, args...) \
@@ -64,7 +65,7 @@ do { \
if (!(len)) \
break; \
\
- printk(KERN_DEBUG fmt, ##args); \
+ printk(KERN_DEBUG pr_fmt(fmt), ##args); \
for (j = 0; j < (len) - 1; j++) \
printk(KERN_CONT "%02hhx ", buf[j]); \
printk(KERN_CONT "%02hhx\n", buf[j]); \
@@ -1001,6 +1002,7 @@ struct text_poke_loc {
s32 rel32;
u8 opcode;
const u8 text[POKE_MAX_OPCODE_SIZE];
+ u8 old;
};
struct bp_patching_desc {
@@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
/*
* First step: add a int3 trap to the address that will be patched.
*/
- for (i = 0; i < nr_entries; i++)
+ for (i = 0; i < nr_entries; i++) {
+ tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+ }
text_poke_sync();
@@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
+ u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
int len = text_opcode_size(tp[i].opcode);
if (len - INT3_INSN_SIZE > 0) {
+ memcpy(old + INT3_INSN_SIZE,
+ text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
(const char *)tp[i].text + INT3_INSN_SIZE,
len - INT3_INSN_SIZE);
do_sync++;
}
+
+ /*
+ * Emit a perf event to record the text poke, primarily to
+ * support Intel PT decoding which must walk the executable code
+ * to reconstruct the trace. The flow up to here is:
+ * - write INT3 byte
+ * - IPI-SYNC
+ * - write instruction tail
+ * At this point the actual control flow will be through the
+ * INT3 and handler and not hit the old or new instruction.
+ * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+ * can still be decoded. Subsequently:
+ * - emit RECORD_TEXT_POKE with the new instruction
+ * - IPI-SYNC
+ * - write first byte
+ * - IPI-SYNC
+ * So before the text poke event timestamp, the decoder will see
+ * either the old instruction flow or FUP/TIP of INT3. After the
+ * text poke event timestamp, the decoder will see either the
+ * new instruction flow or FUP/TIP of INT3. Thus decoders can
+ * use the timestamp as the point at which to modify the
+ * executable code.
+ * The old instruction is recorded so that the event can be
+ * processed forwards or backwards.
+ */
+ perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
+ tp[i].text, len);
}
if (do_sync) {
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 0b71970d2d3d..f0b743a2fe9c 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void)
* If FSGSBASE is enabled, the user can put a kernel address in
* GS, in which case SMAP provides no protection.
*
- * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the
- * FSGSBASE enablement patches have been merged. ]
- *
* If FSGSBASE is disabled, the user can only put a user space
* address in GS. That makes an attack harder, but still
* possible if there's no SMAP protection.
*/
- if (!smap_works_speculatively()) {
+ if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
+ !smap_works_speculatively()) {
/*
* Mitigation can be provided from SWAPGS itself or
* PTI as the CR3 write in the Meltdown mitigation
@@ -763,10 +761,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
}
/*
- * If enhanced IBRS is enabled or SMT impossible, STIBP is not
+ * If STIBP is not available, enhanced IBRS is enabled, or SMT is impossible, STIBP is not
* required.
*/
- if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
+ if (!boot_cpu_has(X86_FEATURE_STIBP) ||
+ !smt_possible ||
+ spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
return;
/*
@@ -778,12 +778,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
mode = SPECTRE_V2_USER_STRICT_PREFERRED;
- /*
- * If STIBP is not available, clear the STIBP mode.
- */
- if (!boot_cpu_has(X86_FEATURE_STIBP))
- mode = SPECTRE_V2_USER_NONE;
-
spectre_v2_user_stibp = mode;
set_mode:
@@ -1270,7 +1264,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
* Indirect branch speculation is always disabled in strict
* mode. It can neither be enabled if it was force-disabled
* by a previous prctl call.
-
*/
if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT ||
spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT ||
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 426792565d86..c5cf336e5077 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -3,6 +3,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 043d93cdcaad..965474d78cef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -347,6 +347,9 @@ out:
cr4_clear_bits(X86_CR4_UMIP);
}
+/* These bits should not change their value after CPU init is finished. */
+static const unsigned long cr4_pinned_mask =
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE;
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -371,20 +374,20 @@ EXPORT_SYMBOL(native_write_cr0);
void native_write_cr4(unsigned long val)
{
- unsigned long bits_missing = 0;
+ unsigned long bits_changed = 0;
set_register:
asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits));
if (static_branch_likely(&cr_pinning)) {
- if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) {
- bits_missing = ~val & cr4_pinned_bits;
- val |= bits_missing;
+ if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) {
+ bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits;
+ val = (val & ~cr4_pinned_mask) | cr4_pinned_bits;
goto set_register;
}
- /* Warn after we've set the missing bits. */
- WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n",
- bits_missing);
+ /* Warn after we've corrected the changed bits. */
+ WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n",
+ bits_changed);
}
}
#if IS_MODULE(CONFIG_LKDTM)
@@ -419,7 +422,7 @@ void cr4_init(void)
if (boot_cpu_has(X86_FEATURE_PCID))
cr4 |= X86_CR4_PCIDE;
if (static_branch_likely(&cr_pinning))
- cr4 |= cr4_pinned_bits;
+ cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits;
__write_cr4(cr4);
@@ -434,13 +437,26 @@ void cr4_init(void)
*/
static void __init setup_cr_pinning(void)
{
- unsigned long mask;
-
- mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP);
- cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask;
+ cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask;
static_key_enable(&cr_pinning.key);
}
+static __init int x86_nofsgsbase_setup(char *arg)
+{
+ /* Require an exact match without trailing characters. */
+ if (strlen(arg))
+ return 0;
+
+ /* Do not emit a message if the feature is not present. */
+ if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+ pr_info("FSGSBASE disabled via kernel command line\n");
+ return 1;
+}
+__setup("nofsgsbase", x86_nofsgsbase_setup);
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1495,6 +1511,12 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smap(c);
setup_umip(c);
+ /* Enable FSGSBASE instructions if available. */
+ if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+ cr4_set_bits(X86_CR4_FSGSBASE);
+ elf_hwcap2 |= HWCAP2_FSGSBASE;
+ }
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
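The CR4 pinning rework above no longer just ORs missing bits back in: since the pinned mask now includes FSGSBASE, which is legitimately pinned to 0 when booted with nofsgsbase, the whole masked field has to be restored in both directions. A small user-space model of that fix-up; the bit positions are the architectural CR4 bits, and the scenario (SMAP cleared, FSGSBASE set by an errant write) is invented for illustration:

#include <stdio.h>

#define X86_CR4_UMIP		(1UL << 11)
#define X86_CR4_FSGSBASE	(1UL << 16)
#define X86_CR4_SMEP		(1UL << 20)
#define X86_CR4_SMAP		(1UL << 21)

int main(void)
{
	/* Booted with nofsgsbase: FSGSBASE is part of the mask but pinned to 0. */
	unsigned long pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP |
				    X86_CR4_UMIP | X86_CR4_FSGSBASE;
	unsigned long pinned_bits = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP;

	/* An errant write clears SMAP and sets FSGSBASE. */
	unsigned long val = (pinned_bits & ~X86_CR4_SMAP) | X86_CR4_FSGSBASE;

	unsigned long bits_changed = (val & pinned_mask) ^ pinned_bits;

	val = (val & ~pinned_mask) | pinned_bits;
	printf("pinned CR4 bits changed: %#lx, repaired value: %#lx\n",
	       bits_changed, val);
	return 0;
}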
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index fb538fccd24c..9d033693519a 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -81,8 +81,4 @@ extern void update_srbds_msr(void);
extern u64 x86_read_arch_cap_msr(void);
-#ifdef CONFIG_IA32_FEAT_CTL
-void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
-#endif
-
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ce9120c4f740..fbe89a92ff36 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1083,7 +1083,7 @@ static noinstr bool mce_check_crashing_cpu(void)
{
unsigned int cpu = smp_processor_id();
- if (cpu_is_offline(cpu) ||
+ if (arch_cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 43c466020ed5..03e51053592a 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -345,7 +345,7 @@ static __init int dev_mcelog_init_device(void)
int err;
mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
- mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL);
+ mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL);
if (!mcelog)
return -ENOMEM;
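struct_size(mcelog, entry, mce_log_len) replaces the open-coded sizeof(*mcelog) + mce_log_len * sizeof(struct mce) and, unlike the open-coded form, saturates instead of wrapping if the multiplication overflows. A rough user-space model of that helper; the struct layout below is a stand-in rather than the real mcelog buffer, and the actual implementation lives in include/linux/overflow.h:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct mce { uint64_t status, addr, misc; };	/* stand-in fields only */

struct mce_log_buffer {
	unsigned int len;
	struct mce entry[];			/* flexible array member */
};

/* Header plus n trailing elements, saturating to SIZE_MAX on overflow. */
static size_t struct_size_model(size_t header, size_t elem, size_t n)
{
	size_t bytes;

	if (__builtin_mul_overflow(elem, n, &bytes) ||
	    __builtin_add_overflow(bytes, header, &bytes))
		return SIZE_MAX;

	return bytes;
}

int main(void)
{
	printf("%zu bytes for 32 entries\n",
	       struct_size_model(sizeof(struct mce_log_buffer),
				 sizeof(struct mce), 32));
	return 0;
}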
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 0593b192eb8f..7843ab3fde09 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -511,7 +511,7 @@ static void do_inject(void)
*/
if (inj_type == DFR_INT_INJ) {
i_mce.status |= MCI_STATUS_DEFERRED;
- i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ i_mce.status &= ~MCI_STATUS_UC;
}
/*
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index baec68b7e010..ec6f0415bc6d 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -145,7 +145,6 @@ extern struct builtin_fw __end_builtin_fw[];
bool get_builtin_firmware(struct cpio_data *cd, const char *name)
{
-#ifdef CONFIG_FW_LOADER
struct builtin_fw *b_fw;
for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
@@ -155,7 +154,6 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
return true;
}
}
-#endif
return false;
}
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 12f967c6b603..6a9df71c1b9e 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -981,10 +981,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c)
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
- if (c->x86_vendor == X86_VENDOR_INTEL)
- c->x86_cache_mbm_width_offset = eax & 0xff;
- else
- c->x86_cache_mbm_width_offset = -1;
+ c->x86_cache_mbm_width_offset = eax & 0xff;
+
+ if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset)
+ c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD;
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index f20a47d120b1..5ffa32256b3b 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -37,6 +37,7 @@
#define MBA_IS_LINEAR 0x4
#define MBA_MAX_MBPS U32_MAX
#define MAX_MBA_BW_AMD 0x800
+#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 23b4b61319d3..3f844f14fc0a 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1117,6 +1117,7 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
_d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
_r_cdp = NULL;
+ _d_cdp = NULL;
ret = -EINVAL;
}
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index df1358ba622b..05fa4ef63490 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -2,6 +2,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 0db21206f2f3..7ecf9babf0cb 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -160,7 +160,7 @@ static const __initconst struct idt_data apic_idts[] = {
/* Must be page-aligned because the real IDT is used in the cpu entry area */
static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
-struct desc_ptr idt_descr __ro_after_init = {
+static struct desc_ptr idt_descr __ro_after_init = {
.size = IDT_TABLE_SIZE - 1,
.address = (unsigned long) idt_table,
};
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index ada39ddbc922..fdadc37d72af 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/sched/debug.h>
+#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
@@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p)
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
+ p->ainsn.tp_len = len;
+ perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
/* OK, write back the instruction(s) into ROX insn buffer */
text_poke(p->ainsn.insn, buf, len);
@@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ u8 int3 = INT3_INSN_OPCODE;
+
+ text_poke(p->addr, &int3, 1);
text_poke_sync();
+ perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
}
void arch_disarm_kprobe(struct kprobe *p)
{
+ u8 int3 = INT3_INSN_OPCODE;
+
+ perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
text_poke(p->addr, &p->opcode, 1);
text_poke_sync();
}
@@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p)
void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
+ /* Record the perf event before freeing the slot */
+ perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+ p->ainsn.tp_len, NULL, 0);
free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
p->ainsn.insn = NULL;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7af4c61dde52..40f380461e6d 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -6,6 +6,7 @@
* Copyright (C) Hitachi Ltd., 2012
*/
#include <linux/kprobes.h>
+#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
@@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op,
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
- if (op->optinsn.insn) {
- free_optinsn_slot(op->optinsn.insn, dirty);
+ u8 *slot = op->optinsn.insn;
+ if (slot) {
+ int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+ /* Record the perf event before freeing the slot */
+ if (dirty)
+ perf_event_text_poke(slot, slot, len, NULL, 0);
+
+ free_optinsn_slot(slot, dirty);
op->optinsn.insn = NULL;
op->optinsn.size = 0;
}
@@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
(u8 *)op->kp.addr + op->optinsn.size);
len += JMP32_INSN_SIZE;
+ /*
+ * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+ * used in __arch_remove_optimized_kprobe().
+ */
+
/* We have to use text_poke() for instruction buffer because it is RO */
+ perf_event_text_poke(slot, NULL, 0, buf, len);
text_poke(slot, buf, len);
+
ret = 0;
out:
kfree(buf);
@@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist)
*/
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- arch_arm_kprobe(&op->kp);
- text_poke(op->kp.addr + INT3_INSN_SIZE,
- op->optinsn.copied_insn, DISP32_SIZE);
+ u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+ u8 old[JMP32_INSN_SIZE];
+ u8 *addr = op->kp.addr;
+
+ memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+ memcpy(new + INT3_INSN_SIZE,
+ op->optinsn.copied_insn,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
+ text_poke(addr, new, INT3_INSN_SIZE);
text_poke_sync();
+ text_poke(addr + INT3_INSN_SIZE,
+ new + INT3_INSN_SIZE,
+ JMP32_INSN_SIZE - INT3_INSN_SIZE);
+ text_poke_sync();
+
+ perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}
/*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1547be359d7f..576c43e39247 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -42,6 +42,14 @@
static struct class *msr_class;
static enum cpuhp_state cpuhp_msr_state;
+enum allow_write_msrs {
+ MSR_WRITES_ON,
+ MSR_WRITES_OFF,
+ MSR_WRITES_DEFAULT,
+};
+
+static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
+
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -70,6 +78,24 @@ static ssize_t msr_read(struct file *file, char __user *buf,
return bytes ? bytes : err;
}
+static int filter_write(u32 reg)
+{
+ switch (allow_writes) {
+ case MSR_WRITES_ON: return 0;
+ case MSR_WRITES_OFF: return -EPERM;
+ default: break;
+ }
+
+ if (reg == MSR_IA32_ENERGY_PERF_BIAS)
+ return 0;
+
+ pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n"
+ "Please report to x86@kernel.org\n",
+ reg, current->comm);
+
+ return 0;
+}
+
static ssize_t msr_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -84,6 +110,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
if (err)
return err;
+ err = filter_write(reg);
+ if (err)
+ return err;
+
if (count % 8)
return -EINVAL; /* Invalid chunk size */
@@ -92,9 +122,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
err = -EFAULT;
break;
}
+
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
if (err)
break;
+
tmp += 2;
bytes += 8;
}
@@ -242,6 +276,41 @@ static void __exit msr_exit(void)
}
module_exit(msr_exit)
+static int set_allow_writes(const char *val, const struct kernel_param *cp)
+{
+ /* val is NUL-terminated, see kernfs_fop_write() */
+ char *s = strstrip((char *)val);
+
+ if (!strcmp(s, "on"))
+ allow_writes = MSR_WRITES_ON;
+ else if (!strcmp(s, "off"))
+ allow_writes = MSR_WRITES_OFF;
+ else
+ allow_writes = MSR_WRITES_DEFAULT;
+
+ return 0;
+}
+
+static int get_allow_writes(char *buf, const struct kernel_param *kp)
+{
+ const char *res;
+
+ switch (allow_writes) {
+ case MSR_WRITES_ON: res = "on"; break;
+ case MSR_WRITES_OFF: res = "off"; break;
+ default: res = "default"; break;
+ }
+
+ return sprintf(buf, "%s\n", res);
+}
+
+static const struct kernel_param_ops allow_writes_ops = {
+ .set = set_allow_writes,
+ .get = get_allow_writes
+};
+
+module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600);
+
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
MODULE_DESCRIPTION("x86 generic MSR driver");
MODULE_LICENSE("GPL");
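With the filtering above, a write through the msr device now taints the kernel, is logged unless the MSR is on the small allow list, and can be rejected outright via the module parameter (msr.allow_writes=off on the command line, or /sys/module/msr/parameters/allow_writes at runtime). A hedged user-space sketch of how such a write reaches msr_write(): the pwrite() offset selects the MSR number and the buffer carries the 64-bit value; the device node requires root and the msr driver, and MSR_IA32_ENERGY_PERF_BIAS (0x1b0) is used here because filter_write() above allows it:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t msr = 0x1b0;	/* MSR_IA32_ENERGY_PERF_BIAS, allowed by filter_write() */
	uint64_t val = 6;	/* example value only */
	int fd = open("/dev/cpu/0/msr", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}

	/* Offset is the MSR number; writes go in 8-byte chunks.
	 * Fails with EPERM when allow_writes is "off". */
	if (pwrite(fd, &val, sizeof(val), msr) != (ssize_t)sizeof(val))
		perror("pwrite");

	close(fd);
	return 0;
}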
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2de365f15684..d7c5e44b26f7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -478,7 +478,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
- if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
+ if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return;
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f362ce0d5ac0..216c88df1919 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
- savesegment(gs, p->thread.gsindex);
- p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
- savesegment(fs, p->thread.fsindex);
- p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
+ current_save_fsgs();
+ p->thread.fsindex = current->thread.fsindex;
+ p->thread.fsbase = current->thread.fsbase;
+ p->thread.gsindex = current->thread.gsindex;
+ p->thread.gsbase = current->thread.gsbase;
+
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
#else
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9a97415b2139..d618969cea66 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -150,6 +150,44 @@ enum which_selector {
};
/*
+ * Out of line to be protected from kprobes and tracing. If this were
+ * traced or probed, any access to a per-CPU variable would happen with
+ * the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with the native_ prefix.
+ */
+static noinstr unsigned long __rdgsbase_inactive(void)
+{
+ unsigned long gsbase;
+
+ lockdep_assert_irqs_disabled();
+
+ native_swapgs();
+ gsbase = rdgsbase();
+ native_swapgs();
+
+ return gsbase;
+}
+
+/*
+ * Out of line to be protected from kprobes and tracing. If this were
+ * traced or probed, any access to a per-CPU variable would happen with
+ * the wrong GS.
+ *
+ * It is not used on Xen paravirt. When paravirt support is needed, it
+ * needs to be renamed with the native_ prefix.
+ */
+static noinstr void __wrgsbase_inactive(unsigned long gsbase)
+{
+ lockdep_assert_irqs_disabled();
+
+ native_swapgs();
+ wrgsbase(gsbase);
+ native_swapgs();
+}
+
+/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
@@ -198,22 +236,35 @@ static __always_inline void save_fsgs(struct task_struct *task)
{
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
- save_base_legacy(task, task->thread.fsindex, FS);
- save_base_legacy(task, task->thread.gsindex, GS);
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /*
+ * If FSGSBASE is enabled, we can't make any useful guesses
+ * about the base, and user code expects us to save the current
+ * value. Fortunately, reading the base directly is efficient.
+ */
+ task->thread.fsbase = rdfsbase();
+ task->thread.gsbase = __rdgsbase_inactive();
+ } else {
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+ }
}
-#if IS_ENABLED(CONFIG_KVM)
/*
* While a process is running, current->thread.fsbase and current->thread.gsbase
- * may not match the corresponding CPU registers (see save_base_legacy()). KVM
- * wants an efficient way to save and restore FSBASE and GSBASE.
- * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
+ * may not match the corresponding CPU registers (see save_base_legacy()).
*/
-void save_fsgs_for_kvm(void)
+void current_save_fsgs(void)
{
+ unsigned long flags;
+
+ /* Interrupts need to be off for FSGSBASE */
+ local_irq_save(flags);
save_fsgs(current);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
+#if IS_ENABLED(CONFIG_KVM)
+EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif
static __always_inline void loadseg(enum which_selector which,
@@ -278,10 +329,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
struct thread_struct *next)
{
- load_seg_legacy(prev->fsindex, prev->fsbase,
- next->fsindex, next->fsbase, FS);
- load_seg_legacy(prev->gsindex, prev->gsbase,
- next->gsindex, next->gsbase, GS);
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ /* Update the FS and GS selectors if they could have changed. */
+ if (unlikely(prev->fsindex || next->fsindex))
+ loadseg(FS, next->fsindex);
+ if (unlikely(prev->gsindex || next->gsindex))
+ loadseg(GS, next->gsindex);
+
+ /* Update the bases. */
+ wrfsbase(next->fsbase);
+ __wrgsbase_inactive(next->gsbase);
+ } else {
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
+ }
}
static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
@@ -327,13 +390,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
}
+unsigned long x86_gsbase_read_cpu_inactive(void)
+{
+ unsigned long gsbase;
+
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ gsbase = __rdgsbase_inactive();
+ local_irq_restore(flags);
+ } else {
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ }
+
+ return gsbase;
+}
+
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+ if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __wrgsbase_inactive(gsbase);
+ local_irq_restore(flags);
+ } else {
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+ }
+}
+
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
unsigned long fsbase;
if (task == current)
fsbase = x86_fsbase_read_cpu();
- else if (task->thread.fsindex == 0)
+ else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.fsindex == 0))
fsbase = task->thread.fsbase;
else
fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -347,7 +441,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
if (task == current)
gsbase = x86_gsbase_read_cpu_inactive();
- else if (task->thread.gsindex == 0)
+ else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+ (task->thread.gsindex == 0))
gsbase = task->thread.gsbase;
else
gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 44130588987f..1c7646c8d0fe 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -379,25 +379,12 @@ static int putreg(struct task_struct *child,
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_MAX)
return -EIO;
- /*
- * When changing the FS base, use do_arch_prctl_64()
- * to set the index to zero and to set the base
- * as requested.
- *
- * NB: This behavior is nonsensical and likely needs to
- * change when FSGSBASE support is added.
- */
- if (child->thread.fsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_FS, value);
+ x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
- /*
- * Exactly the same here as the %fs handling above.
- */
if (value >= TASK_SIZE_MAX)
return -EIO;
- if (child->thread.gsbase != value)
- return do_arch_prctl_64(child, ARCH_SET_GS, value);
+ x86_gsbase_write_task(child, value);
return 0;
#endif
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ffbd9a3d78d8..27aa04a95702 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -51,11 +51,11 @@
#include <linux/err.h>
#include <linux/nmi.h>
#include <linux/tboot.h>
-#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
+#include <linux/overflow.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -80,6 +80,7 @@
#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -259,21 +260,10 @@ static void notrace start_secondary(void *unused)
/* enable local interrupts */
local_irq_enable();
- /* to prevent fake stack check failure in clock setup */
- boot_init_stack_canary();
-
x86_cpuinit.setup_percpu_clockev();
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
- /*
- * Prevent tail call to cpu_startup_entry() because the stack protector
- * guard has been changed a couple of function calls up, in
- * boot_init_stack_canary() and must not be checked before tail calling
- * another function.
- */
- prevent_tail_call_optimization();
}
/**
@@ -1011,6 +1001,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
alternatives_enable_smp();
per_cpu(current_task, cpu) = idle;
+ cpu_init_stack_canary(cpu, idle);
/* Initialize the interrupt stack(s) */
ret = irq_init_percpu_irqstack(cpu);
@@ -1777,6 +1768,7 @@ void native_play_dead(void)
#endif
+#ifdef CONFIG_X86_64
/*
* APERF/MPERF frequency ratio computation.
*
@@ -1975,6 +1967,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
static bool intel_set_max_freq_ratio(void)
{
u64 base_freq, turbo_freq;
+ u64 turbo_ratio;
if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
goto out;
@@ -2000,15 +1993,23 @@ out:
/*
* Some hypervisors advertise X86_FEATURE_APERFMPERF
* but then fill all MSR's with zeroes.
+ * Some CPUs have turbo boost but don't declare any turbo ratio
+ * in MSR_TURBO_RATIO_LIMIT.
*/
- if (!base_freq) {
- pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+ if (!base_freq || !turbo_freq) {
+ pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
return false;
}
- arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
- base_freq);
+ turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+ if (!turbo_ratio) {
+ pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = turbo_ratio;
arch_set_max_freq_ratio(turbo_disabled());
+
return true;
}
@@ -2048,11 +2049,19 @@ static void init_freq_invariance(bool secondary)
}
}
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+ static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+ disable_freq_invariance_workfn);
+
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_scale_freq_tick(void)
{
- u64 freq_scale;
+ u64 freq_scale = SCHED_CAPACITY_SCALE;
u64 aperf, mperf;
u64 acnt, mcnt;
@@ -2064,19 +2073,32 @@ void arch_scale_freq_tick(void)
acnt = aperf - this_cpu_read(arch_prev_aperf);
mcnt = mperf - this_cpu_read(arch_prev_mperf);
- if (!mcnt)
- return;
this_cpu_write(arch_prev_aperf, aperf);
this_cpu_write(arch_prev_mperf, mperf);
- acnt <<= 2*SCHED_CAPACITY_SHIFT;
- mcnt *= arch_max_freq_ratio;
+ if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+ goto error;
+
+ if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ goto error;
freq_scale = div64_u64(acnt, mcnt);
+ if (!freq_scale)
+ goto error;
if (freq_scale > SCHED_CAPACITY_SCALE)
freq_scale = SCHED_CAPACITY_SCALE;
this_cpu_write(arch_freq_scale, freq_scale);
+ return;
+
+error:
+ pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+ schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
}
+#endif /* CONFIG_X86_64 */
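arch_scale_freq_tick() now guards every step of the ratio computation and, via the new work item, disables frequency invariance instead of feeding the scheduler a bogus scale. A user-space model of the per-tick arithmetic with the same overflow checks; SCHED_CAPACITY_SHIFT/SCALE are the scheduler's 10/1024, while the APERF/MPERF deltas and the max-frequency ratio are invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t acnt = 123456;			/* APERF delta since the last tick */
	uint64_t mcnt = 100000;			/* MPERF delta since the last tick */
	uint64_t max_freq_ratio = 1536;		/* turbo/base * SCHED_CAPACITY_SCALE */
	uint64_t freq_scale;

	if (__builtin_mul_overflow(acnt, 1ULL << (2 * SCHED_CAPACITY_SHIFT), &acnt) ||
	    __builtin_mul_overflow(mcnt, max_freq_ratio, &mcnt) || !mcnt) {
		puts("overflow or zero MPERF: disable frequency invariance");
		return 1;
	}

	freq_scale = acnt / mcnt;
	if (!freq_scale) {
		puts("ratio collapsed to zero: disable frequency invariance");
		return 1;
	}

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	printf("freq_scale = %llu / %lu\n",
	       (unsigned long long)freq_scale, SCHED_CAPACITY_SCALE);
	return 0;
}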
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index cb22f33bf1d8..a58453f7058e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1172,7 +1172,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
- save_fsgs_for_kvm();
+ current_save_fsgs();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index fff28c6f73a2..b0dfac3d3df7 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -24,6 +24,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
asm volatile(
" testq %[size8],%[size8]\n"
" jz 4f\n"
+ " .align 16\n"
"0: movq $0,(%[dst])\n"
" addq $8,%[dst]\n"
" decl %%ecx ; jnz 0b\n"
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 001dd7dc829f..c7a47603537f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -25,6 +25,7 @@
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
+#include <asm/memtype.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -912,8 +913,6 @@ void free_kernel_image_pages(const char *what, void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
}
-void __weak mem_encrypt_free_decrypted_mem(void) { }
-
void __ref free_initmem(void)
{
e820__reallocate_tables();
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 4a781cf99e92..9f1177edc2e7 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -376,7 +376,6 @@ bool force_dma_unencrypted(struct device *dev)
return false;
}
-/* Architecture __weak replacement functions */
void __init mem_encrypt_free_decrypted_mem(void)
{
unsigned long vaddr, vaddr_end, npages;
@@ -401,6 +400,7 @@ void __init mem_encrypt_free_decrypted_mem(void)
free_init_pages("unused decrypted", vaddr, vaddr_end);
}
+/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 7c65102debaf..db1378c6ff26 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -193,6 +193,8 @@ static void fix_processor_context(void)
*/
static void notrace __restore_processor_state(struct saved_context *ctxt)
{
+ struct cpuinfo_x86 *c;
+
if (ctxt->misc_enable_saved)
wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
/*
@@ -263,6 +265,10 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
mtrr_bp_restore();
perf_restore_debug_store();
msr_restore_context(ctxt);
+
+ c = &cpu_data(smp_processor_id());
+ if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL))
+ init_ia32_feat_ctl(c);
}
/* Needed by apm.c */
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 171aff1b11f2..9ea598dcc132 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -92,9 +92,7 @@ static void cpu_bringup(void)
asmlinkage __visible void cpu_bringup_and_idle(void)
{
cpu_bringup();
- boot_init_stack_canary();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
- prevent_tail_call_optimization();
}
void xen_smp_intr_free_pv(unsigned int cpu)
diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c
index 3655d9d3f5df..198a8eb1cd56 100644
--- a/crypto/crypto_engine.c
+++ b/crypto/crypto_engine.c
@@ -482,7 +482,6 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
int (*cbk_do_batch)(struct crypto_engine *engine),
bool rt, int qlen)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
struct crypto_engine *engine;
if (!dev)
@@ -520,7 +519,7 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev,
if (engine->rt) {
dev_info(dev, "will run requests pump with realtime priority\n");
- sched_setscheduler(engine->kworker->task, SCHED_FIFO, &param);
+ sched_set_fifo(engine->kworker->task);
}
return engine;
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index e7dc0133f817..0ad9f2059e6d 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -136,12 +136,11 @@ static unsigned int idle_pct = 5; /* percentage */
static unsigned int round_robin_time = 1; /* second */
static int power_saving_thread(void *data)
{
- struct sched_param param = {.sched_priority = 1};
int do_sleep;
unsigned int tsk_index = (unsigned long)data;
u64 last_jiffies = 0;
- sched_setscheduler(current, SCHED_RR, &param);
+ sched_set_fifo_low(current);
while (!kthread_should_stop()) {
unsigned long expire_time;
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 3a3f2b6a821f..280615efef74 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -6019,11 +6019,8 @@ int drbd_ack_receiver(struct drbd_thread *thi)
unsigned int header_size = drbd_header_size(connection);
int expect = header_size;
bool ping_timeout_active = false;
- struct sched_param param = { .sched_priority = 2 };
- rv = sched_setscheduler(current, SCHED_RR, &param);
- if (rv < 0)
- drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv);
+ sched_set_fifo_low(current);
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
index e6fc022bc87e..3939699e62fe 100644
--- a/drivers/firmware/efi/Kconfig
+++ b/drivers/firmware/efi/Kconfig
@@ -278,3 +278,14 @@ config EFI_EARLYCON
depends on SERIAL_EARLYCON && !ARM && !IA64
select FONT_SUPPORT
select ARCH_USE_MEMREMAP_PROT
+
+config EFI_CUSTOM_SSDT_OVERLAYS
+ bool "Load custom ACPI SSDT overlay from an EFI variable"
+ depends on EFI_VARS && ACPI
+ default ACPI_TABLE_UPGRADE
+ help
+ Allow loading of an ACPI SSDT overlay from an EFI variable specified
+ by a kernel command line option.
+
+ See Documentation/admin-guide/acpi/ssdt-overlays.rst for more
+ information.
diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c
index c697e70ca7e7..71c445d20258 100644
--- a/drivers/firmware/efi/arm-init.c
+++ b/drivers/firmware/efi/arm-init.c
@@ -52,9 +52,11 @@ static phys_addr_t __init efi_to_phys(unsigned long addr)
}
static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR;
+static __initdata unsigned long cpu_state_table = EFI_INVALID_TABLE_ADDR;
static const efi_config_table_type_t arch_tables[] __initconst = {
{LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, &screen_info_table},
+ {LINUX_EFI_ARM_CPU_STATE_TABLE_GUID, &cpu_state_table},
{}
};
@@ -62,7 +64,8 @@ static void __init init_screen_info(void)
{
struct screen_info *si;
- if (screen_info_table != EFI_INVALID_TABLE_ADDR) {
+ if (IS_ENABLED(CONFIG_ARM) &&
+ screen_info_table != EFI_INVALID_TABLE_ADDR) {
si = early_memremap_ro(screen_info_table, sizeof(*si));
if (!si) {
pr_err("Could not map screen_info config table\n");
@@ -116,7 +119,8 @@ static int __init uefi_init(u64 efi_system_table)
goto out;
}
retval = efi_config_parse_tables(config_tables, systab->nr_tables,
- arch_tables);
+ IS_ENABLED(CONFIG_ARM) ? arch_tables
+ : NULL);
early_memunmap(config_tables, table_size);
out:
@@ -238,9 +242,37 @@ void __init efi_init(void)
init_screen_info();
+#ifdef CONFIG_ARM
/* ARM does not permit early mappings to persist across paging_init() */
- if (IS_ENABLED(CONFIG_ARM))
- efi_memmap_unmap();
+ efi_memmap_unmap();
+
+ if (cpu_state_table != EFI_INVALID_TABLE_ADDR) {
+ struct efi_arm_entry_state *state;
+ bool dump_state = true;
+
+ state = early_memremap_ro(cpu_state_table,
+ sizeof(struct efi_arm_entry_state));
+ if (state == NULL) {
+ pr_warn("Unable to map CPU entry state table.\n");
+ return;
+ }
+
+ if ((state->sctlr_before_ebs & 1) == 0)
+ pr_warn(FW_BUG "EFI stub was entered with MMU and Dcache disabled, please fix your firmware!\n");
+ else if ((state->sctlr_after_ebs & 1) == 0)
+ pr_warn(FW_BUG "ExitBootServices() returned with MMU and Dcache disabled, please fix your firmware!\n");
+ else
+ dump_state = false;
+
+ if (dump_state || efi_enabled(EFI_DBG)) {
+ pr_info("CPSR at EFI stub entry : 0x%08x\n", state->cpsr_before_ebs);
+ pr_info("SCTLR at EFI stub entry : 0x%08x\n", state->sctlr_before_ebs);
+ pr_info("CPSR after ExitBootServices() : 0x%08x\n", state->cpsr_after_ebs);
+ pr_info("SCTLR after ExitBootServices(): 0x%08x\n", state->sctlr_after_ebs);
+ }
+ early_memunmap(state, sizeof(struct efi_arm_entry_state));
+ }
+#endif
}
static bool efifb_overlaps_pci_range(const struct of_pci_range *range)
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index 7f1657b6c30d..5114cae4ec97 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -189,7 +189,7 @@ static void generic_ops_unregister(void)
efivars_unregister(&generic_efivars);
}
-#if IS_ENABLED(CONFIG_ACPI)
+#ifdef CONFIG_EFI_CUSTOM_SSDT_OVERLAYS
#define EFIVAR_SSDT_NAME_MAX 16
static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata;
static int __init efivar_ssdt_setup(char *str)
@@ -622,7 +622,8 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables,
rsv = (void *)(p + prsv % PAGE_SIZE);
/* reserve the entry itself */
- memblock_reserve(prsv, EFI_MEMRESERVE_SIZE(rsv->size));
+ memblock_reserve(prsv,
+ struct_size(rsv, entry, rsv->size));
for (i = 0; i < atomic_read(&rsv->count); i++) {
memblock_reserve(rsv->entry[i].base,
diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c
index e3d692696583..d5915272141f 100644
--- a/drivers/firmware/efi/esrt.c
+++ b/drivers/firmware/efi/esrt.c
@@ -181,7 +181,7 @@ static int esre_create_sysfs_entry(void *esre, int entry_num)
rc = kobject_init_and_add(&entry->kobj, &esre1_ktype, NULL,
"entry%d", entry_num);
if (rc) {
- kfree(entry);
+ kobject_put(&entry->kobj);
return rc;
}
}
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 75daaf20374e..4cce372edaf4 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -6,7 +6,8 @@
# enabled, even if doing so doesn't break the build.
#
cflags-$(CONFIG_X86_32) := -march=i386
-cflags-$(CONFIG_X86_64) := -mcmodel=small
+cflags-$(CONFIG_X86_64) := -mcmodel=small \
+ $(call cc-option,-maccumulate-outgoing-args)
cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ \
-fPIC -fno-strict-aliasing -mno-red-zone \
-mno-mmx -mno-sse -fshort-wchar \
diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
index 40243f524556..d08e5d55838c 100644
--- a/drivers/firmware/efi/libstub/arm32-stub.c
+++ b/drivers/firmware/efi/libstub/arm32-stub.c
@@ -7,10 +7,49 @@
#include "efistub.h"
+static efi_guid_t cpu_state_guid = LINUX_EFI_ARM_CPU_STATE_TABLE_GUID;
+
+struct efi_arm_entry_state *efi_entry_state;
+
+static void get_cpu_state(u32 *cpsr, u32 *sctlr)
+{
+ asm("mrs %0, cpsr" : "=r"(*cpsr));
+ if ((*cpsr & MODE_MASK) == HYP_MODE)
+ asm("mrc p15, 4, %0, c1, c0, 0" : "=r"(*sctlr));
+ else
+ asm("mrc p15, 0, %0, c1, c0, 0" : "=r"(*sctlr));
+}
+
efi_status_t check_platform_features(void)
{
+ efi_status_t status;
+ u32 cpsr, sctlr;
int block;
+ get_cpu_state(&cpsr, &sctlr);
+
+ efi_info("Entering in %s mode with MMU %sabled\n",
+ ((cpsr & MODE_MASK) == HYP_MODE) ? "HYP" : "SVC",
+ (sctlr & 1) ? "en" : "dis");
+
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA,
+ sizeof(*efi_entry_state),
+ (void **)&efi_entry_state);
+ if (status != EFI_SUCCESS) {
+ efi_err("allocate_pool() failed\n");
+ return status;
+ }
+
+ efi_entry_state->cpsr_before_ebs = cpsr;
+ efi_entry_state->sctlr_before_ebs = sctlr;
+
+ status = efi_bs_call(install_configuration_table, &cpu_state_guid,
+ efi_entry_state);
+ if (status != EFI_SUCCESS) {
+ efi_err("install_configuration_table() failed\n");
+ goto free_state;
+ }
+
/* non-LPAE kernels can run anywhere */
if (!IS_ENABLED(CONFIG_ARM_LPAE))
return EFI_SUCCESS;
@@ -19,9 +58,22 @@ efi_status_t check_platform_features(void)
block = cpuid_feature_extract(CPUID_EXT_MMFR0, 0);
if (block < 5) {
efi_err("This LPAE kernel is not supported by your CPU\n");
- return EFI_UNSUPPORTED;
+ status = EFI_UNSUPPORTED;
+ goto drop_table;
}
return EFI_SUCCESS;
+
+drop_table:
+ efi_bs_call(install_configuration_table, &cpu_state_guid, NULL);
+free_state:
+ efi_bs_call(free_pool, efi_entry_state);
+ return status;
+}
+
+void efi_handle_post_ebs_state(void)
+{
+ get_cpu_state(&efi_entry_state->cpsr_after_ebs,
+ &efi_entry_state->sctlr_after_ebs);
}
static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID;
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 89f075275300..d40fd68c6bb2 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -32,6 +32,10 @@ bool __pure __efi_soft_reserve_enabled(void)
return !efi_nosoftreserve;
}
+/**
+ * efi_char16_puts() - Write a UCS-2 encoded string to the console
+ * @str: UCS-2 encoded string
+ */
void efi_char16_puts(efi_char16_t *str)
{
efi_call_proto(efi_table_attr(efi_system_table, con_out),
@@ -83,6 +87,10 @@ u32 utf8_to_utf32(const u8 **s8)
return c32;
}
+/**
+ * efi_puts() - Write a UTF-8 encoded string to the console
+ * @str: UTF-8 encoded string
+ */
void efi_puts(const char *str)
{
efi_char16_t buf[128];
@@ -113,6 +121,16 @@ void efi_puts(const char *str)
}
}
+/**
+ * efi_printk() - Print a kernel message
+ * @fmt: format string
+ *
+ * The first letter of the format string is used to determine the logging level
+ * of the message. If the level is less than the current EFI logging level, the
+ * message is suppressed. The message will be truncated to 255 bytes.
+ *
+ * Return: number of printed characters
+ */
int efi_printk(const char *fmt, ...)
{
char printf_buf[256];
@@ -154,13 +172,18 @@ int efi_printk(const char *fmt, ...)
return printed;
}
-/*
- * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi=
+/**
+ * efi_parse_options() - Parse EFI command line options
+ * @cmdline: kernel command line
+ *
+ * Parse the ASCII string @cmdline for EFI options, denoted by the efi=
* option, e.g. efi=nochunk.
*
* It should be noted that efi= is parsed in two very different
* environments, first in the early boot environment of the EFI boot
* stub, and subsequently during the kernel boot.
+ *
+ * Return: status code
*/
efi_status_t efi_parse_options(char const *cmdline)
{
@@ -286,13 +309,21 @@ char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len)
return (char *)cmdline_addr;
}
-/*
+/**
+ * efi_exit_boot_services() - Exit boot services
+ * @handle: handle of the exiting image
+ * @map: pointer to receive the memory map
+ * @priv: argument to be passed to @priv_func
+ * @priv_func: function to process the memory map before exiting boot services
+ *
* Handle calling ExitBootServices according to the requirements set out by the
* spec. Obtains the current memory map, and returns that info after calling
* ExitBootServices. The client must specify a function to perform any
* processing of the memory map data prior to ExitBootServices. A client
* specific structure may be passed to the function via priv. The client
* function may be called multiple times.
+ *
+ * Return: status code
*/
efi_status_t efi_exit_boot_services(void *handle,
struct efi_boot_memmap *map,
@@ -361,6 +392,11 @@ fail:
return status;
}
+/**
+ * get_efi_config_table() - retrieve UEFI configuration table
+ * @guid: GUID of the configuration table to be retrieved
+ * Return: pointer to the configuration table or NULL
+ */
void *get_efi_config_table(efi_guid_t guid)
{
unsigned long tables = efi_table_attr(efi_system_table, tables);
@@ -408,17 +444,18 @@ static const struct {
};
/**
- * efi_load_initrd_dev_path - load the initrd from the Linux initrd device path
+ * efi_load_initrd_dev_path() - load the initrd from the Linux initrd device path
* @load_addr: pointer to store the address where the initrd was loaded
* @load_size: pointer to store the size of the loaded initrd
* @max: upper limit for the initrd memory allocation
- * @return: %EFI_SUCCESS if the initrd was loaded successfully, in which
- * case @load_addr and @load_size are assigned accordingly
- * %EFI_NOT_FOUND if no LoadFile2 protocol exists on the initrd
- * device path
- * %EFI_INVALID_PARAMETER if load_addr == NULL or load_size == NULL
- * %EFI_OUT_OF_RESOURCES if memory allocation failed
- * %EFI_LOAD_ERROR in all other cases
+ *
+ * Return:
+ * * %EFI_SUCCESS if the initrd was loaded successfully, in which
+ * case @load_addr and @load_size are assigned accordingly
+ * * %EFI_NOT_FOUND if no LoadFile2 protocol exists on the initrd device path
+ * * %EFI_INVALID_PARAMETER if load_addr == NULL or load_size == NULL
+ * * %EFI_OUT_OF_RESOURCES if memory allocation failed
+ * * %EFI_LOAD_ERROR in all other cases
*/
static
efi_status_t efi_load_initrd_dev_path(unsigned long *load_addr,
@@ -481,6 +518,16 @@ efi_status_t efi_load_initrd_cmdline(efi_loaded_image_t *image,
load_addr, load_size);
}
+/**
+ * efi_load_initrd() - Load initial RAM disk
+ * @image: EFI loaded image protocol
+ * @load_addr: pointer to loaded initrd
+ * @load_size: size of loaded initrd
+ * @soft_limit: preferred size of allocated memory for loading the initrd
+ * @hard_limit: minimum size of allocated memory
+ *
+ * Return: status code
+ */
efi_status_t efi_load_initrd(efi_loaded_image_t *image,
unsigned long *load_addr,
unsigned long *load_size,
@@ -505,6 +552,15 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image,
return status;
}
+/**
+ * efi_wait_for_key() - Wait for key stroke
+ * @usec: number of microseconds to wait for key stroke
+ * @key: key entered
+ *
+ * Wait for up to @usec microseconds for a key stroke.
+ *
+ * Return: status code, EFI_SUCCESS if key received
+ */
efi_status_t efi_wait_for_key(unsigned long usec, efi_input_key_t *key)
{
efi_event_t events[2], timer;
diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c
index e97370bdfdb0..3318ec3f8e5b 100644
--- a/drivers/firmware/efi/libstub/efi-stub.c
+++ b/drivers/firmware/efi/libstub/efi-stub.c
@@ -329,6 +329,9 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
if (status != EFI_SUCCESS)
goto fail_free_initrd;
+ if (IS_ENABLED(CONFIG_ARM))
+ efi_handle_post_ebs_state();
+
efi_enter_kernel(image_addr, fdt_addr, fdt_totalsize((void *)fdt_addr));
/* not reached */
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index bcd8c0a785f0..2c9d42264c29 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -157,8 +157,14 @@ typedef void (__efiapi *efi_event_notify_t)(efi_event_t, void *);
#define EFI_EVT_NOTIFY_WAIT 0x00000100U
#define EFI_EVT_NOTIFY_SIGNAL 0x00000200U
-/*
- * boottime->wait_for_event takes an array of events as input.
+/**
+ * efi_set_event_at() - add event to events array
+ *
+ * @events: array of UEFI events
+ * @ids: index where to put the event in the array
+ * @event: event to add to the array
+ *
+ * boottime->wait_for_event() takes an array of events as input.
* Provide a helper to set it up correctly for mixed mode.
*/
static inline
@@ -771,4 +777,6 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image,
unsigned long soft_limit,
unsigned long hard_limit);
+void efi_handle_post_ebs_state(void);
+
#endif
diff --git a/drivers/firmware/efi/libstub/file.c b/drivers/firmware/efi/libstub/file.c
index 2005e33b33d5..630caa6b1f4c 100644
--- a/drivers/firmware/efi/libstub/file.c
+++ b/drivers/firmware/efi/libstub/file.c
@@ -102,12 +102,20 @@ static int find_file_option(const efi_char16_t *cmdline, int cmdline_len,
if (!found)
return 0;
+ /* Skip any leading slashes */
+ while (cmdline[i] == L'/' || cmdline[i] == L'\\')
+ i++;
+
while (--result_len > 0 && i < cmdline_len) {
- if (cmdline[i] == L'\0' ||
- cmdline[i] == L'\n' ||
- cmdline[i] == L' ')
+ efi_char16_t c = cmdline[i++];
+
+ if (c == L'\0' || c == L'\n' || c == L' ')
break;
- *result++ = cmdline[i++];
+ else if (c == L'/')
+ /* Replace UNIX dir separators with EFI standard ones */
+ *result++ = L'\\';
+ else
+ *result++ = c;
}
*result = L'\0';
return i;
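
The hunk above changes how the stub parses the file option: leading slashes are skipped and remaining '/' separators are rewritten as '\' so the resulting path follows EFI file-system conventions. A standalone sketch of that behaviour, using plain char strings instead of efi_char16_t (illustrative only, not code from the tree):

#include <stdio.h>

static void to_efi_path(const char *in, char *out)
{
	while (*in == '/' || *in == '\\')	/* skip leading slashes */
		in++;
	for (; *in && *in != ' ' && *in != '\n'; in++)
		*out++ = (*in == '/') ? '\\' : *in;	/* '/' -> '\' */
	*out = '\0';
}

int main(void)
{
	char buf[64];

	to_efi_path("/boot/initrd.img", buf);
	printf("%s\n", buf);		/* prints: boot\initrd.img */
	return 0;
}
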
diff --git a/drivers/firmware/efi/libstub/skip_spaces.c b/drivers/firmware/efi/libstub/skip_spaces.c
index a700b3c7f7d0..159fb4e456c6 100644
--- a/drivers/firmware/efi/libstub/skip_spaces.c
+++ b/drivers/firmware/efi/libstub/skip_spaces.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/ctype.h>
+#include <linux/string.h>
#include <linux/types.h>
char *skip_spaces(const char *str)
diff --git a/drivers/firmware/psci/psci_checker.c b/drivers/firmware/psci/psci_checker.c
index 873841af8d57..6fff482847e7 100644
--- a/drivers/firmware/psci/psci_checker.c
+++ b/drivers/firmware/psci/psci_checker.c
@@ -272,7 +272,6 @@ static int suspend_test_thread(void *arg)
{
int cpu = (long)arg;
int i, nb_suspend = 0, nb_shallow_sleep = 0, nb_err = 0;
- struct sched_param sched_priority = { .sched_priority = MAX_RT_PRIO-1 };
struct cpuidle_device *dev;
struct cpuidle_driver *drv;
/* No need for an actual callback, we just want to wake up the CPU. */
@@ -282,9 +281,7 @@ static int suspend_test_thread(void *arg)
wait_for_completion(&suspend_threads_started);
/* Set maximum priority to preempt all other threads on this CPU. */
- if (sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_priority))
- pr_warn("Failed to set suspend thread scheduler on CPU %d\n",
- cpu);
+ sched_set_fifo(current);
dev = this_cpu_read(cpuidle_devices);
drv = cpuidle_get_cpu_driver(dev);
@@ -349,11 +346,6 @@ static int suspend_test_thread(void *arg)
if (atomic_dec_return_relaxed(&nb_active_threads) == 0)
complete(&suspend_threads_done);
- /* Give up on RT scheduling and wait for termination. */
- sched_priority.sched_priority = 0;
- if (sched_setscheduler_nocheck(current, SCHED_NORMAL, &sched_priority))
- pr_warn("Failed to set suspend thread scheduler on CPU %d\n",
- cpu);
for (;;) {
/* Needs to be set first to avoid missing a wakeup. */
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index c981cc10aebf..6c57cc72d627 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -389,7 +389,6 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv)
struct msm_kms *kms;
struct msm_mdss *mdss;
int ret, i;
- struct sched_param param;
ddev = drm_dev_alloc(drv, dev);
if (IS_ERR(ddev)) {
@@ -495,12 +494,6 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv)
ddev->mode_config.funcs = &mode_config_funcs;
ddev->mode_config.helper_private = &mode_config_helper_funcs;
- /**
- * this priority was found during empiric testing to have appropriate
- * realtime scheduling to process display updates and interact with
- * other real time and normal priority task
- */
- param.sched_priority = 16;
for (i = 0; i < priv->num_crtcs; i++) {
/* initialize event thread */
priv->event_thread[i].crtc_id = priv->crtcs[i]->base.id;
@@ -516,11 +509,7 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv)
goto err_msm_uninit;
}
- ret = sched_setscheduler(priv->event_thread[i].thread,
- SCHED_FIFO, &param);
- if (ret)
- dev_warn(dev, "event_thread set priority failed:%d\n",
- ret);
+ sched_set_fifo(priv->event_thread[i].thread);
}
ret = drm_vblank_init(ddev, priv->num_crtcs);
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 2f319102ae9f..17cf77e2c4e5 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -760,11 +760,10 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
*/
static int drm_sched_main(void *param)
{
- struct sched_param sparam = {.sched_priority = 1};
struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
int r;
- sched_setscheduler(current, SCHED_FIFO, &sparam);
+ sched_set_fifo_low(current);
while (!kthread_should_stop()) {
struct drm_sched_entity *entity = NULL;
diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c
index 267eac00a3fb..29f5fed28c2a 100644
--- a/drivers/hwmon/fam15h_power.c
+++ b/drivers/hwmon/fam15h_power.c
@@ -41,10 +41,6 @@ MODULE_LICENSE("GPL");
/* set maximum interval as 1 second */
#define MAX_INTERVAL 1000
-#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
-#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
-#define MSR_F15H_PTSC 0xc0010280
-
#define PCI_DEVICE_ID_AMD_15H_M70H_NB_F4 0x15b4
struct fam15h_power_data {
diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c
index b1dde1be6f5e..28acb14490d5 100644
--- a/drivers/media/pci/ivtv/ivtv-driver.c
+++ b/drivers/media/pci/ivtv/ivtv-driver.c
@@ -737,8 +737,6 @@ done:
*/
static int ivtv_init_struct1(struct ivtv *itv)
{
- struct sched_param param = { .sched_priority = 99 };
-
itv->base_addr = pci_resource_start(itv->pdev, 0);
itv->enc_mbox.max_mbox = 2; /* the encoder has 3 mailboxes (0-2) */
itv->dec_mbox.max_mbox = 1; /* the decoder has 2 mailboxes (0-1) */
@@ -758,7 +756,7 @@ static int ivtv_init_struct1(struct ivtv *itv)
return -1;
}
/* must use the FIFO scheduler as it is realtime sensitive */
- sched_setscheduler(itv->irq_worker_task, SCHED_FIFO, &param);
+ sched_set_fifo(itv->irq_worker_task);
kthread_init_work(&itv->irq_work, ivtv_irq_work_handler);
diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c
index 3ffe4ff49aa7..4b1f7c966ec8 100644
--- a/drivers/mmc/core/sdio_irq.c
+++ b/drivers/mmc/core/sdio_irq.c
@@ -139,11 +139,10 @@ EXPORT_SYMBOL_GPL(sdio_signal_irq);
static int sdio_irq_thread(void *_host)
{
struct mmc_host *host = _host;
- struct sched_param param = { .sched_priority = 1 };
unsigned long period, idle_period;
int ret;
- sched_setscheduler(current, SCHED_FIFO, &param);
+ sched_set_fifo_low(current);
/*
* We want to allow for SDIO cards to work even on non SDIO
diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c
index debea5c4c829..d09260382550 100644
--- a/drivers/platform/chrome/cros_ec_spi.c
+++ b/drivers/platform/chrome/cros_ec_spi.c
@@ -709,9 +709,6 @@ static void cros_ec_spi_high_pri_release(void *worker)
static int cros_ec_spi_devm_high_pri_alloc(struct device *dev,
struct cros_ec_spi *ec_spi)
{
- struct sched_param sched_priority = {
- .sched_priority = MAX_RT_PRIO / 2,
- };
int err;
ec_spi->high_pri_worker =
@@ -728,11 +725,9 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev,
if (err)
return err;
- err = sched_setscheduler_nocheck(ec_spi->high_pri_worker->task,
- SCHED_FIFO, &sched_priority);
- if (err)
- dev_err(dev, "Can't set cros_ec high pri priority: %d\n", err);
- return err;
+ sched_set_fifo(ec_spi->high_pri_worker->task);
+
+ return 0;
}
static int cros_ec_spi_probe(struct spi_device *spi)
diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c
index c90f0990968b..bc396a8231bf 100644
--- a/drivers/powercap/idle_inject.c
+++ b/drivers/powercap/idle_inject.c
@@ -268,9 +268,7 @@ void idle_inject_stop(struct idle_inject_device *ii_dev)
*/
static void idle_inject_setup(unsigned int cpu)
{
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
-
- sched_setscheduler(current, SCHED_FIFO, &param);
+ sched_set_fifo(current);
}
/**
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 8158e281f354..5a4f0bfce474 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -1592,11 +1592,9 @@ EXPORT_SYMBOL_GPL(spi_take_timestamp_post);
*/
static void spi_set_thread_rt(struct spi_controller *ctlr)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
-
dev_info(&ctlr->dev,
"will run message pump with realtime priority\n");
- sched_setscheduler(ctlr->kworker_task, SCHED_FIFO, &param);
+ sched_set_fifo(ctlr->kworker_task);
}
static int spi_init_queue(struct spi_controller *ctlr)
diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c
index 0755b11348ed..563b84cda2f0 100644
--- a/drivers/staging/android/ion/ion_heap.c
+++ b/drivers/staging/android/ion/ion_heap.c
@@ -244,8 +244,6 @@ static int ion_heap_deferred_free(void *data)
int ion_heap_init_deferred_free(struct ion_heap *heap)
{
- struct sched_param param = { .sched_priority = 0 };
-
INIT_LIST_HEAD(&heap->free_list);
init_waitqueue_head(&heap->waitqueue);
heap->task = kthread_run(ion_heap_deferred_free, heap,
@@ -255,7 +253,7 @@ int ion_heap_init_deferred_free(struct ion_heap *heap)
__func__);
return PTR_ERR_OR_ZERO(heap->task);
}
- sched_setscheduler(heap->task, SCHED_IDLE, &param);
+ sched_set_normal(heap->task, 19);
return 0;
}
diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index f74b2473440d..b0eb5ece9243 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -70,9 +70,6 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update
*/
static bool clamping;
-static const struct sched_param sparam = {
- .sched_priority = MAX_USER_RT_PRIO / 2,
-};
struct powerclamp_worker_data {
struct kthread_worker *worker;
struct kthread_work balancing_work;
@@ -488,7 +485,7 @@ static void start_power_clamp_worker(unsigned long cpu)
w_data->cpu = cpu;
w_data->clamping = true;
set_bit(cpu, cpu_clamping_mask);
- sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
+ sched_set_fifo(worker->task);
kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
kthread_init_delayed_work(&w_data->idle_injection_work,
clamp_idle_injection_func);
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index d2e5c6c86643..809610b37c71 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -1179,7 +1179,6 @@ static int sc16is7xx_probe(struct device *dev,
const struct sc16is7xx_devtype *devtype,
struct regmap *regmap, int irq)
{
- struct sched_param sched_param = { .sched_priority = MAX_RT_PRIO / 2 };
unsigned long freq = 0, *pfreq = dev_get_platdata(dev);
unsigned int val;
u32 uartclk = 0;
@@ -1239,7 +1238,7 @@ static int sc16is7xx_probe(struct device *dev,
ret = PTR_ERR(s->kworker_task);
goto out_clk;
}
- sched_setscheduler(s->kworker_task, SCHED_FIFO, &sched_param);
+ sched_set_fifo(s->kworker_task);
#ifdef CONFIG_GPIOLIB
if (devtype->nr_gpio) {
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 7e4cd34a8c20..b9dc2c352151 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -1144,14 +1144,13 @@ void watchdog_dev_unregister(struct watchdog_device *wdd)
int __init watchdog_dev_init(void)
{
int err;
- struct sched_param param = {.sched_priority = MAX_RT_PRIO - 1,};
watchdog_kworker = kthread_create_worker(0, "watchdogd");
if (IS_ERR(watchdog_kworker)) {
pr_err("Failed to create watchdog kworker\n");
return PTR_ERR(watchdog_kworker);
}
- sched_setscheduler(watchdog_kworker->task, SCHED_FIFO, &param);
+ sched_set_fifo(watchdog_kworker->task);
err = class_register(&watchdog_class);
if (err < 0) {
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index e9e27a271af0..feaa5e182b7b 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,6 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
+ inode->i_mtime = current_time(inode);
inode_unlock(inode);
}
@@ -72,10 +73,8 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf,
ssize_t size = 0;
int err;
- while (!__ratelimit(&file->f_cred->user->ratelimit)) {
- if (!msleep_interruptible(50))
- return -EINTR;
- }
+ while (!__ratelimit(&file->f_cred->user->ratelimit))
+ msleep(50);
err = efivar_entry_size(var, &datasize);
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index ee37256ec8bd..5e55302e3bf6 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -33,6 +33,14 @@
#define __no_sanitize_thread
#endif
+#if __has_feature(undefined_behavior_sanitizer)
+/* GCC does not have __SANITIZE_UNDEFINED__ */
+#define __no_sanitize_undefined \
+ __attribute__((no_sanitize("undefined")))
+#else
+#define __no_sanitize_undefined
+#endif
+
/*
 * Not all versions of clang implement the type-generic versions
* of the builtin overflow checkers. Fortunately, clang implements
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 7dd4e0349ef3..1c74464c80c6 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -150,6 +150,12 @@
#define __no_sanitize_thread
#endif
+#if __has_attribute(__no_sanitize_undefined__)
+#define __no_sanitize_undefined __attribute__((no_sanitize_undefined))
+#else
+#define __no_sanitize_undefined
+#endif
+
#if GCC_VERSION >= 50100
#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
#endif
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index cdf016596659..c8f03d2969df 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -40,6 +40,7 @@
# define __GCC4_has_attribute___noclone__ 1
# define __GCC4_has_attribute___nonstring__ 0
# define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8)
+# define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9)
# define __GCC4_has_attribute___fallthrough__ 0
#endif
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index e368384445b6..c3bf7710f69a 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -118,10 +118,6 @@ struct ftrace_likely_data {
#define notrace __attribute__((__no_instrument_function__))
#endif
-/* Section for code which can't be instrumented at all */
-#define noinstr \
- noinline notrace __attribute((__section__(".noinstr.text")))
-
/*
* it doesn't make sense on ARM (currently the only user of __naked)
* to trace naked functions because then mcount is called without
@@ -193,16 +189,18 @@ struct ftrace_likely_data {
#define __no_kcsan __no_sanitize_thread
#ifdef __SANITIZE_THREAD__
-# define __no_kcsan_or_inline __no_kcsan notrace __maybe_unused
-# define __no_sanitize_or_inline __no_kcsan_or_inline
-#else
-# define __no_kcsan_or_inline __always_inline
+# define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused
#endif
#ifndef __no_sanitize_or_inline
#define __no_sanitize_or_inline __always_inline
#endif
+/* Section for code which can't be instrumented at all */
+#define noinstr \
+ noinline notrace __attribute((__section__(".noinstr.text"))) \
+ __no_kcsan __no_sanitize_address
+
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
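
With the change above, noinstr now implies __no_kcsan and __no_sanitize_address in addition to noinline and notrace, so a single annotation keeps a function out of the tracer and out of KCSAN/KASAN instrumentation while placing it in the .noinstr.text section. A minimal illustrative use (the function name is hypothetical):

/* Entry-path code that must not be instrumented at all. */
noinstr void my_early_entry_code(void)
{
	/* must not call anything that can be traced or instrumented */
}
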
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 52692587f7fe..8aa84c052fdf 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -64,6 +64,7 @@ extern ssize_t cpu_show_tsx_async_abort(struct device *dev,
char *buf);
extern ssize_t cpu_show_itlb_multihit(struct device *dev,
struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf);
extern __printf(4, 5)
struct device *cpu_device_create(struct device *parent, void *drvdata,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2c6495f72f79..bb35f3305e55 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -350,6 +350,7 @@ void efi_native_runtime_setup(void);
* associated with ConOut
*/
#define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, 0xb9, 0x0e, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95)
+#define LINUX_EFI_ARM_CPU_STATE_TABLE_GUID EFI_GUID(0xef79e4aa, 0x3c3d, 0x4989, 0xb9, 0x02, 0x07, 0xa9, 0x43, 0xe5, 0x50, 0xd2)
#define LINUX_EFI_LOADER_ENTRY_GUID EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f)
#define LINUX_EFI_RANDOM_SEED_TABLE_GUID EFI_GUID(0x1ce1e5bc, 0x7ceb, 0x42f2, 0x81, 0xe5, 0x8a, 0xad, 0xf1, 0x80, 0xf5, 0x7b)
#define LINUX_EFI_TPM_EVENT_LOG_GUID EFI_GUID(0xb7799cb0, 0xeca2, 0x4943, 0x96, 0x67, 0x1f, 0xae, 0x07, 0xb7, 0x47, 0xfa)
@@ -1236,14 +1237,11 @@ struct linux_efi_memreserve {
struct {
phys_addr_t base;
phys_addr_t size;
- } entry[0];
+ } entry[];
};
-#define EFI_MEMRESERVE_SIZE(count) (sizeof(struct linux_efi_memreserve) + \
- (count) * sizeof(((struct linux_efi_memreserve *)0)->entry[0]))
-
#define EFI_MEMRESERVE_COUNT(size) (((size) - sizeof(struct linux_efi_memreserve)) \
- / sizeof(((struct linux_efi_memreserve *)0)->entry[0]))
+ / sizeof_field(struct linux_efi_memreserve, entry[0]))
void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size);
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e339dac91ee6..ce2c06f72e86 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,9 +58,6 @@ struct ftrace_direct_func;
const char *
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char **modname, char *sym);
-int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name,
- char *module_name, int *exported);
#else
static inline const char *
ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
@@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
{
return NULL;
}
+#endif
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name,
+ char *module_name, int *exported);
+#else
static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
char *type, char *name,
char *module_name, int *exported)
@@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
}
#endif
-
#ifdef CONFIG_FUNCTION_TRACER
extern int ftrace_enabled;
@@ -207,6 +210,7 @@ struct ftrace_ops {
struct ftrace_ops_hash old_hash;
unsigned long trampoline;
unsigned long trampoline_size;
+ struct list_head list;
#endif
};
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 2735da5f839e..30823780c192 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -2,7 +2,7 @@
#ifndef _LINUX_IRQ_WORK_H
#define _LINUX_IRQ_WORK_H
-#include <linux/llist.h>
+#include <linux/smp_types.h>
/*
* An entry can be in one of four states:
@@ -13,24 +13,14 @@
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
*/
-/* flags share CSD_FLAG_ space */
-
-#define IRQ_WORK_PENDING BIT(0)
-#define IRQ_WORK_BUSY BIT(1)
-
-/* Doesn't want IPI, wait for tick: */
-#define IRQ_WORK_LAZY BIT(2)
-/* Run hard IRQ context, even on RT */
-#define IRQ_WORK_HARD_IRQ BIT(3)
-
-#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY)
-
-/*
- * structure shares layout with single_call_data_t.
- */
struct irq_work {
- struct llist_node llnode;
- atomic_t flags;
+ union {
+ struct __call_single_node node;
+ struct {
+ struct llist_node llnode;
+ atomic_t flags;
+ };
+ };
void (*func)(struct irq_work *);
};
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6adf90f248d7..45b8cdc9fad7 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -242,6 +242,7 @@ struct kprobe_insn_cache {
struct mutex mutex;
void *(*alloc)(void); /* allocate insn page */
void (*free)(void *); /* free insn page */
+ const char *sym; /* symbol for insn pages */
struct list_head pages; /* list of kprobe_insn_page */
size_t insn_size; /* size of instruction slot */
int nr_garbage;
@@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \
{ \
return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \
}
+#define KPROBE_INSN_PAGE_SYM "kprobe_insn_page"
+#define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page"
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym);
#else /* __ARCH_WANT_KPROBES_INSN_SLOT */
#define DEFINE_INSN_CACHE_OPS(__name) \
static inline bool is_kprobe_##__name##_slot(unsigned long addr) \
@@ -377,6 +382,11 @@ void dump_kprobe(struct kprobe *kp);
void *alloc_insn_page(void);
void free_insn_page(void *page);
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym);
+
+int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym);
#else /* !CONFIG_KPROBES: */
static inline int kprobes_built_in(void)
@@ -439,6 +449,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr)
{
return true;
}
+static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
#endif /* CONFIG_KPROBES */
static inline int disable_kretprobe(struct kretprobe *rp)
{
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 8fce5c98a4b0..3b73cf84f77d 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -10,181 +10,20 @@
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H
+#include <linux/lockdep_types.h>
+
struct task_struct;
-struct lockdep_map;
/* for sysctl */
extern int prove_locking;
extern int lock_stat;
-#define MAX_LOCKDEP_SUBCLASSES 8UL
-
-#include <linux/types.h>
-
-enum lockdep_wait_type {
- LD_WAIT_INV = 0, /* not checked, catch all */
-
- LD_WAIT_FREE, /* wait free, rcu etc.. */
- LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */
-
-#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
- LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */
-#else
- LD_WAIT_CONFIG = LD_WAIT_SPIN,
-#endif
- LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */
-
- LD_WAIT_MAX, /* must be last */
-};
-
#ifdef CONFIG_LOCKDEP
#include <linux/linkage.h>
-#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>
-/*
- * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
- * the total number of states... :-(
- */
-#define XXX_LOCK_USAGE_STATES (1+2*4)
-
-/*
- * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
- * cached in the instance of lockdep_map
- *
- * Currently main class (subclass == 0) and signle depth subclass
- * are cached in lockdep_map. This optimization is mainly targeting
- * on rq->lock. double_rq_lock() acquires this highly competitive with
- * single depth.
- */
-#define NR_LOCKDEP_CACHING_CLASSES 2
-
-/*
- * A lockdep key is associated with each lock object. For static locks we use
- * the lock address itself as the key. Dynamically allocated lock objects can
- * have a statically or dynamically allocated key. Dynamically allocated lock
- * keys must be registered before being used and must be unregistered before
- * the key memory is freed.
- */
-struct lockdep_subclass_key {
- char __one_byte;
-} __attribute__ ((__packed__));
-
-/* hash_entry is used to keep track of dynamically allocated keys. */
-struct lock_class_key {
- union {
- struct hlist_node hash_entry;
- struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
- };
-};
-
-extern struct lock_class_key __lockdep_no_validate__;
-
-struct lock_trace;
-
-#define LOCKSTAT_POINTS 4
-
-/*
- * The lock-class itself. The order of the structure members matters.
- * reinit_class() zeroes the key member and all subsequent members.
- */
-struct lock_class {
- /*
- * class-hash:
- */
- struct hlist_node hash_entry;
-
- /*
- * Entry in all_lock_classes when in use. Entry in free_lock_classes
- * when not in use. Instances that are being freed are on one of the
- * zapped_classes lists.
- */
- struct list_head lock_entry;
-
- /*
- * These fields represent a directed graph of lock dependencies,
- * to every node we attach a list of "forward" and a list of
- * "backward" graph nodes.
- */
- struct list_head locks_after, locks_before;
-
- const struct lockdep_subclass_key *key;
- unsigned int subclass;
- unsigned int dep_gen_id;
-
- /*
- * IRQ/softirq usage tracking bits:
- */
- unsigned long usage_mask;
- const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES];
-
- /*
- * Generation counter, when doing certain classes of graph walking,
- * to ensure that we check one node only once:
- */
- int name_version;
- const char *name;
-
- short wait_type_inner;
- short wait_type_outer;
-
-#ifdef CONFIG_LOCK_STAT
- unsigned long contention_point[LOCKSTAT_POINTS];
- unsigned long contending_point[LOCKSTAT_POINTS];
-#endif
-} __no_randomize_layout;
-
-#ifdef CONFIG_LOCK_STAT
-struct lock_time {
- s64 min;
- s64 max;
- s64 total;
- unsigned long nr;
-};
-
-enum bounce_type {
- bounce_acquired_write,
- bounce_acquired_read,
- bounce_contended_write,
- bounce_contended_read,
- nr_bounce_types,
-
- bounce_acquired = bounce_acquired_write,
- bounce_contended = bounce_contended_write,
-};
-
-struct lock_class_stats {
- unsigned long contention_point[LOCKSTAT_POINTS];
- unsigned long contending_point[LOCKSTAT_POINTS];
- struct lock_time read_waittime;
- struct lock_time write_waittime;
- struct lock_time read_holdtime;
- struct lock_time write_holdtime;
- unsigned long bounces[nr_bounce_types];
-};
-
-struct lock_class_stats lock_stats(struct lock_class *class);
-void clear_lock_stats(struct lock_class *class);
-#endif
-
-/*
- * Map the lock object (the lock instance) to the lock-class object.
- * This is embedded into specific lock instances:
- */
-struct lockdep_map {
- struct lock_class_key *key;
- struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES];
- const char *name;
- short wait_type_outer; /* can be taken in this context */
- short wait_type_inner; /* presents this context */
-#ifdef CONFIG_LOCK_STAT
- int cpu;
- unsigned long ip;
-#endif
-};
-
static inline void lockdep_copy_map(struct lockdep_map *to,
struct lockdep_map *from)
{
@@ -440,8 +279,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
-struct pin_cookie { unsigned int val; };
-
#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
@@ -520,10 +357,6 @@ static inline void lockdep_set_selftest_task(struct task_struct *task)
# define lockdep_reset() do { debug_locks = 1; } while (0)
# define lockdep_free_key_range(start, size) do { } while (0)
# define lockdep_sys_exit() do { } while (0)
-/*
- * The class key takes no space if lockdep is disabled:
- */
-struct lock_class_key { };
static inline void lockdep_register_key(struct lock_class_key *key)
{
@@ -533,11 +366,6 @@ static inline void lockdep_unregister_key(struct lock_class_key *key)
{
}
-/*
- * The lockdep_map takes no space if lockdep is disabled:
- */
-struct lockdep_map { };
-
#define lockdep_depth(tsk) (0)
#define lockdep_is_held_type(l, r) (1)
@@ -549,8 +377,6 @@ struct lockdep_map { };
#define lockdep_recursing(tsk) (0)
-struct pin_cookie { };
-
#define NIL_COOKIE (struct pin_cookie){ }
#define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; })
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
new file mode 100644
index 000000000000..7b9350624577
--- /dev/null
+++ b/include/linux/lockdep_types.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Runtime locking correctness validator
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * see Documentation/locking/lockdep-design.rst for more details.
+ */
+#ifndef __LINUX_LOCKDEP_TYPES_H
+#define __LINUX_LOCKDEP_TYPES_H
+
+#include <linux/types.h>
+
+#define MAX_LOCKDEP_SUBCLASSES 8UL
+
+enum lockdep_wait_type {
+ LD_WAIT_INV = 0, /* not checked, catch all */
+
+ LD_WAIT_FREE, /* wait free, rcu etc.. */
+ LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */
+
+#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
+ LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */
+#else
+ LD_WAIT_CONFIG = LD_WAIT_SPIN,
+#endif
+ LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */
+
+ LD_WAIT_MAX, /* must be last */
+};
+
+#ifdef CONFIG_LOCKDEP
+
+#include <linux/list.h>
+
+/*
+ * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
+ * the total number of states... :-(
+ */
+#define XXX_LOCK_USAGE_STATES (1+2*4)
+
+/*
+ * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
+ * cached in the instance of lockdep_map
+ *
+ * Currently the main class (subclass == 0) and a single depth subclass
+ * are cached in lockdep_map. This optimization mainly targets
+ * rq->lock: double_rq_lock() acquires it at single subclass depth,
+ * and that acquisition is highly contended.
+ */
+#define NR_LOCKDEP_CACHING_CLASSES 2
+
+/*
+ * A lockdep key is associated with each lock object. For static locks we use
+ * the lock address itself as the key. Dynamically allocated lock objects can
+ * have a statically or dynamically allocated key. Dynamically allocated lock
+ * keys must be registered before being used and must be unregistered before
+ * the key memory is freed.
+ */
+struct lockdep_subclass_key {
+ char __one_byte;
+} __attribute__ ((__packed__));
+
+/* hash_entry is used to keep track of dynamically allocated keys. */
+struct lock_class_key {
+ union {
+ struct hlist_node hash_entry;
+ struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
+ };
+};
+
+extern struct lock_class_key __lockdep_no_validate__;
+
+struct lock_trace;
+
+#define LOCKSTAT_POINTS 4
+
+/*
+ * The lock-class itself. The order of the structure members matters.
+ * reinit_class() zeroes the key member and all subsequent members.
+ */
+struct lock_class {
+ /*
+ * class-hash:
+ */
+ struct hlist_node hash_entry;
+
+ /*
+ * Entry in all_lock_classes when in use. Entry in free_lock_classes
+ * when not in use. Instances that are being freed are on one of the
+ * zapped_classes lists.
+ */
+ struct list_head lock_entry;
+
+ /*
+ * These fields represent a directed graph of lock dependencies,
+ * to every node we attach a list of "forward" and a list of
+ * "backward" graph nodes.
+ */
+ struct list_head locks_after, locks_before;
+
+ const struct lockdep_subclass_key *key;
+ unsigned int subclass;
+ unsigned int dep_gen_id;
+
+ /*
+ * IRQ/softirq usage tracking bits:
+ */
+ unsigned long usage_mask;
+ const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES];
+
+ /*
+ * Generation counter, when doing certain classes of graph walking,
+ * to ensure that we check one node only once:
+ */
+ int name_version;
+ const char *name;
+
+ short wait_type_inner;
+ short wait_type_outer;
+
+#ifdef CONFIG_LOCK_STAT
+ unsigned long contention_point[LOCKSTAT_POINTS];
+ unsigned long contending_point[LOCKSTAT_POINTS];
+#endif
+} __no_randomize_layout;
+
+#ifdef CONFIG_LOCK_STAT
+struct lock_time {
+ s64 min;
+ s64 max;
+ s64 total;
+ unsigned long nr;
+};
+
+enum bounce_type {
+ bounce_acquired_write,
+ bounce_acquired_read,
+ bounce_contended_write,
+ bounce_contended_read,
+ nr_bounce_types,
+
+ bounce_acquired = bounce_acquired_write,
+ bounce_contended = bounce_contended_write,
+};
+
+struct lock_class_stats {
+ unsigned long contention_point[LOCKSTAT_POINTS];
+ unsigned long contending_point[LOCKSTAT_POINTS];
+ struct lock_time read_waittime;
+ struct lock_time write_waittime;
+ struct lock_time read_holdtime;
+ struct lock_time write_holdtime;
+ unsigned long bounces[nr_bounce_types];
+};
+
+struct lock_class_stats lock_stats(struct lock_class *class);
+void clear_lock_stats(struct lock_class *class);
+#endif
+
+/*
+ * Map the lock object (the lock instance) to the lock-class object.
+ * This is embedded into specific lock instances:
+ */
+struct lockdep_map {
+ struct lock_class_key *key;
+ struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES];
+ const char *name;
+ short wait_type_outer; /* can be taken in this context */
+ short wait_type_inner; /* presents this context */
+#ifdef CONFIG_LOCK_STAT
+ int cpu;
+ unsigned long ip;
+#endif
+};
+
+struct pin_cookie { unsigned int val; };
+
+#else /* !CONFIG_LOCKDEP */
+
+/*
+ * The class key takes no space if lockdep is disabled:
+ */
+struct lock_class_key { };
+
+/*
+ * The lockdep_map takes no space if lockdep is disabled:
+ */
+struct lockdep_map { };
+
+struct pin_cookie { };
+
+#endif /* !LOCKDEP */
+
+#endif /* __LINUX_LOCKDEP_TYPES_H */
diff --git a/include/linux/math64.h b/include/linux/math64.h
index 11a267413e8e..d097119419e6 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor)
}
#endif /* mul_u64_u32_div */
+u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div);
+
#define DIV64_U64_ROUND_UP(ll, d) \
({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); })
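
The newly declared mul_u64_u64_div_u64() pairs with the x86 div64.h change listed in the diffstat; judging by the name and signature it computes a * mul / div while keeping the intermediate product wider than 64 bits so the multiplication cannot overflow. A hedged caller sketch (names and values are illustrative only):

#include <linux/math64.h>

/* Scale a 64-bit cycle count by a 64-bit mult/div ratio. */
static u64 scale_cycles(u64 cycles, u64 mult, u64 div)
{
	return mul_u64_u64_div_u64(cycles, mult, div);
}
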
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b4bb32082342..46fe5cfb5163 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1232,6 +1232,9 @@ extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
+extern void perf_event_text_poke(const void *addr,
+ const void *old_bytes, size_t old_len,
+ const void *new_bytes, size_t new_len);
/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
@@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void) { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec) { }
static inline void perf_event_namespaces(struct task_struct *tsk) { }
static inline void perf_event_fork(struct task_struct *tsk) { }
+static inline void perf_event_text_poke(const void *addr,
+ const void *old_bytes,
+ size_t old_len,
+ const void *new_bytes,
+ size_t new_len) { }
static inline void perf_event_init(void) { }
static inline int perf_swevent_get_recursion_context(void) { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx) { }
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 4b7258495a04..b95f3211566a 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -153,9 +153,10 @@ struct psi_group {
unsigned long avg[NR_PSI_STATES - 1][3];
/* Monitor work control */
- atomic_t poll_scheduled;
- struct kthread_worker __rcu *poll_kworker;
- struct kthread_delayed_work poll_work;
+ struct task_struct __rcu *poll_task;
+ struct timer_list poll_timer;
+ wait_queue_head_t poll_wait;
+ atomic_t poll_wakeup;
/* Protects data used by the monitor */
struct mutex trigger_lock;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 224b5de568e7..9bd073a10224 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -654,11 +654,8 @@ struct task_struct {
unsigned int ptrace;
#ifdef CONFIG_SMP
- struct {
- struct llist_node wake_entry;
- unsigned int wake_entry_type;
- };
int on_cpu;
+ struct __call_single_node wake_entry;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
unsigned int cpu;
@@ -1655,6 +1652,9 @@ extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
+extern void sched_set_fifo(struct task_struct *p);
+extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);
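
The three helpers declared above are what the driver hunks earlier in this merge convert to: sched_set_fifo() and sched_set_fifo_low() replace open-coded sched_setscheduler(SCHED_FIFO, ...) calls with driver-chosen priorities, and sched_set_normal() replaces the SCHED_NORMAL/SCHED_IDLE resets. A minimal sketch of how a kernel thread might use them (the thread function is hypothetical; the exact priorities picked by the helpers are left to the scheduler core):

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_irq_worker(void *data)
{
	/* was: sched_setscheduler_nocheck(current, SCHED_FIFO, &param); */
	sched_set_fifo(current);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	/* drop back to a normal task before exiting, if desired */
	sched_set_normal(current, 0);
	return 0;
}
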
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 0fbcbacd1b29..cc9f393e2a70 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -14,6 +14,7 @@ enum hk_flags {
HK_FLAG_DOMAIN = (1 << 5),
HK_FLAG_WQ = (1 << 6),
HK_FLAG_MANAGED_IRQ = (1 << 7),
+ HK_FLAG_KTHREAD = (1 << 8),
};
#ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 660ac49f2b53..24be30a40814 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -61,6 +61,9 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
+extern unsigned int sysctl_sched_dl_period_max;
+extern unsigned int sysctl_sched_dl_period_min;
+
#ifdef CONFIG_UCLAMP_TASK
extern unsigned int sysctl_sched_uclamp_util_min;
extern unsigned int sysctl_sched_uclamp_util_max;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7ee202ad21a6..80d557ef8a11 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -12,29 +12,22 @@
#include <linux/list.h>
#include <linux/cpumask.h>
#include <linux/init.h>
-#include <linux/llist.h>
+#include <linux/smp_types.h>
typedef void (*smp_call_func_t)(void *info);
typedef bool (*smp_cond_func_t)(int cpu, void *info);
-enum {
- CSD_FLAG_LOCK = 0x01,
-
- /* IRQ_WORK_flags */
-
- CSD_TYPE_ASYNC = 0x00,
- CSD_TYPE_SYNC = 0x10,
- CSD_TYPE_IRQ_WORK = 0x20,
- CSD_TYPE_TTWU = 0x30,
- CSD_FLAG_TYPE_MASK = 0xF0,
-};
-
/*
* structure shares (partial) layout with struct irq_work
*/
struct __call_single_data {
- struct llist_node llist;
- unsigned int flags;
+ union {
+ struct __call_single_node node;
+ struct {
+ struct llist_node llist;
+ unsigned int flags;
+ };
+ };
smp_call_func_t func;
void *info;
};
diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h
new file mode 100644
index 000000000000..364b3ae3e41d
--- /dev/null
+++ b/include/linux/smp_types.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_SMP_TYPES_H
+#define __LINUX_SMP_TYPES_H
+
+#include <linux/llist.h>
+
+enum {
+ CSD_FLAG_LOCK = 0x01,
+
+ IRQ_WORK_PENDING = 0x01,
+ IRQ_WORK_BUSY = 0x02,
+ IRQ_WORK_LAZY = 0x04, /* No IPI, wait for tick */
+ IRQ_WORK_HARD_IRQ = 0x08, /* IRQ context on PREEMPT_RT */
+
+ IRQ_WORK_CLAIMED = (IRQ_WORK_PENDING | IRQ_WORK_BUSY),
+
+ CSD_TYPE_ASYNC = 0x00,
+ CSD_TYPE_SYNC = 0x10,
+ CSD_TYPE_IRQ_WORK = 0x20,
+ CSD_TYPE_TTWU = 0x30,
+
+ CSD_FLAG_TYPE_MASK = 0xF0,
+};
+
+/*
+ * struct __call_single_node is the primary type on
+ * smp.c:call_single_queue.
+ *
+ * flush_smp_call_function_queue() only reads the type from
+ * __call_single_node::u_flags as a regular load; the above
+ * (anonymous) enum defines all the bits of this word.
+ *
+ * Other bits are not modified until the type is known.
+ *
+ * CSD_TYPE_SYNC/ASYNC:
+ * struct {
+ * struct llist_node node;
+ * unsigned int flags;
+ * smp_call_func_t func;
+ * void *info;
+ * };
+ *
+ * CSD_TYPE_IRQ_WORK:
+ * struct {
+ * struct llist_node node;
+ * atomic_t flags;
+ * void (*func)(struct irq_work *);
+ * };
+ *
+ * CSD_TYPE_TTWU:
+ * struct {
+ * struct llist_node node;
+ * unsigned int flags;
+ * };
+ *
+ */
+
+struct __call_single_node {
+ struct llist_node llist;
+ union {
+ unsigned int u_flags;
+ atomic_t a_flags;
+ };
+};
+
+#endif /* __LINUX_SMP_TYPES_H */
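
The comment above notes that flush_smp_call_function_queue() determines an entry's type purely from the flags word of struct __call_single_node before interpreting the rest of the entry. An illustrative sketch of that test (not the actual kernel/smp.c code):

static unsigned int csd_node_type(struct __call_single_node *node)
{
	unsigned int flags = READ_ONCE(node->u_flags);

	/* one of CSD_TYPE_ASYNC, CSD_TYPE_SYNC, CSD_TYPE_IRQ_WORK, CSD_TYPE_TTWU */
	return flags & CSD_FLAG_TYPE_MASK;
}
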
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d3770b3f9d9a..f2f12d746dbd 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -56,6 +56,7 @@
#include <linux/kernel.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
+#include <linux/lockdep.h>
#include <asm/barrier.h>
#include <asm/mmiowb.h>
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 6102e6bff3ae..b981caafe8bf 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -15,7 +15,7 @@
# include <linux/spinlock_types_up.h>
#endif
-#include <linux/lockdep.h>
+#include <linux/lockdep_types.h>
typedef struct raw_spinlock {
arch_spinlock_t raw_lock;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7c354c2955f5..b951a87da987 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1360,7 +1360,7 @@ static inline long ksys_lchown(const char __user *filename, uid_t user,
extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
-static inline long ksys_ftruncate(unsigned int fd, unsigned long length)
+static inline long ksys_ftruncate(unsigned int fd, loff_t length)
{
return do_sys_ftruncate(fd, length, 1);
}
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index b27e2ffa96c1..d5471d6fa778 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -222,9 +222,9 @@ extern bool timekeeping_rtc_skipresume(void);
extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);
-/*
+/**
* struct system_time_snapshot - simultaneous raw/real time capture with
- * counter value
+ * counter value
* @cycles: Clocksource counter value to produce the system times
* @real: Realtime system time
* @raw: Monotonic raw system time
@@ -239,9 +239,9 @@ struct system_time_snapshot {
u8 cs_was_changed_seq;
};
-/*
+/**
* struct system_device_crosststamp - system/device cross-timestamp
- * (syncronized capture)
+ * (synchronized capture)
* @device: Device time
* @sys_realtime: Realtime simultaneous with device time
* @sys_monoraw: Monotonic raw simultaneous with device time
@@ -252,12 +252,12 @@ struct system_device_crosststamp {
ktime_t sys_monoraw;
};
-/*
+/**
* struct system_counterval_t - system counter value with the pointer to the
- * corresponding clocksource
+ * corresponding clocksource
* @cycles: System counter value
* @cs: Clocksource corresponding to system counter value. Used by
- * timekeeping code to verify comparibility of two cycle values
+ * timekeeping code to verify comparability of two cycle values
*/
struct system_counterval_t {
u64 cycles;
diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
index 4f8c90c93c29..64356b199e94 100644
--- a/include/linux/tpm_eventlog.h
+++ b/include/linux/tpm_eventlog.h
@@ -81,6 +81,8 @@ struct tcg_efi_specid_event_algs {
u16 digest_size;
} __packed;
+#define TCG_SPECID_SIG "Spec ID Event03"
+
struct tcg_efi_specid_event_head {
u8 signature[16];
u32 platform_class;
@@ -171,6 +173,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
int i;
int j;
u32 count, event_type;
+ const u8 zero_digest[sizeof(event_header->digest)] = {0};
marker = event;
marker_start = marker;
@@ -198,10 +201,19 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
count = READ_ONCE(event->count);
event_type = READ_ONCE(event->event_type);
+ /* Verify that it's the log header */
+ if (event_header->pcr_idx != 0 ||
+ event_header->event_type != NO_ACTION ||
+ memcmp(event_header->digest, zero_digest, sizeof(zero_digest))) {
+ size = 0;
+ goto out;
+ }
+
efispecid = (struct tcg_efi_specid_event_head *)event_header->event;
/* Check if event is malformed. */
- if (count > efispecid->num_algs) {
+ if (memcmp(efispecid->signature, TCG_SPECID_SIG,
+ sizeof(TCG_SPECID_SIG)) || count > efispecid->num_algs) {
size = 0;
goto out;
}
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ed168b0e2c53..04f9a4c7b0d9 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -634,6 +634,14 @@ DECLARE_TRACE(sched_overutilized_tp,
TP_PROTO(struct root_domain *rd, bool overutilized),
TP_ARGS(rd, overutilized));
+DECLARE_TRACE(sched_util_est_cfs_tp,
+ TP_PROTO(struct cfs_rq *cfs_rq),
+ TP_ARGS(cfs_rq));
+
+DECLARE_TRACE(sched_util_est_se_tp,
+ TP_PROTO(struct sched_entity *se),
+ TP_ARGS(se));
+
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7b2d6fc9e6ed..52ca2093831c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -383,7 +383,8 @@ struct perf_event_attr {
bpf_event : 1, /* include bpf events */
aux_output : 1, /* generate AUX records instead of events */
cgroup : 1, /* include cgroup events */
- __reserved_1 : 31;
+ text_poke : 1, /* include text poke events */
+ __reserved_1 : 30;
union {
__u32 wakeup_events; /* wakeup every n events */
@@ -1024,12 +1025,35 @@ enum perf_event_type {
*/
PERF_RECORD_CGROUP = 19,
+ /*
+ * Records changes to kernel text, i.e. self-modifying code. 'old_len' is
+ * the number of old bytes, 'new_len' is the number of new bytes. Either
+ * 'old_len' or 'new_len' may be zero to indicate, for example, the
+ * addition or removal of a trampoline. 'bytes' contains the old bytes
+ * followed immediately by the new bytes.
+ *
+ * struct {
+ * struct perf_event_header header;
+ * u64 addr;
+ * u16 old_len;
+ * u16 new_len;
+ * u8 bytes[];
+ * struct sample_id sample_id;
+ * };
+ */
+ PERF_RECORD_TEXT_POKE = 20,
+
PERF_RECORD_MAX, /* non-ABI */
};
enum perf_record_ksymbol_type {
PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0,
PERF_RECORD_KSYMBOL_TYPE_BPF = 1,
+ /*
+ * Out of line code such as kprobe-replaced instructions or optimized
+ * kprobes or ftrace trampolines.
+ */
+ PERF_RECORD_KSYMBOL_TYPE_OOL = 2,
PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */
};
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index d72beda824aa..53314d7da4be 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -45,11 +45,6 @@ static int __init early_initrdmem(char *p)
}
early_param("initrdmem", early_initrdmem);
-/*
- * This is here as the initrd keyword has been in use since 11/2018
- * on ARM, PowerPC, and MIPS.
- * It should not be; it is reserved for bootloaders.
- */
static int __init early_initrd(char *p)
{
return early_initrdmem(p);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..9b8f92500833 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event)
if (attr->mmap || attr->mmap_data || attr->mmap2 ||
attr->comm || attr->comm_exec ||
attr->task || attr->ksymbol ||
- attr->context_switch ||
+ attr->context_switch || attr->text_poke ||
attr->bpf_event)
return true;
return false;
@@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_dec(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_dec(&nr_text_poke_events);
if (dec) {
if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog,
perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
}
+struct perf_text_poke_event {
+ const void *old_bytes;
+ const void *new_bytes;
+ size_t pad;
+ u16 old_len;
+ u16 new_len;
+
+ struct {
+ struct perf_event_header header;
+
+ u64 addr;
+ } event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+ return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+ struct perf_text_poke_event *text_poke_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u64 padding = 0;
+ int ret;
+
+ if (!perf_event_text_poke_match(event))
+ return;
+
+ perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+ if (ret)
+ return;
+
+ perf_output_put(&handle, text_poke_event->event_id);
+ perf_output_put(&handle, text_poke_event->old_len);
+ perf_output_put(&handle, text_poke_event->new_len);
+
+ __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+ __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+ if (text_poke_event->pad)
+ __output_copy(&handle, &padding, text_poke_event->pad);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+ size_t old_len, const void *new_bytes, size_t new_len)
+{
+ struct perf_text_poke_event text_poke_event;
+ size_t tot, pad;
+
+ if (!atomic_read(&nr_text_poke_events))
+ return;
+
+ tot = sizeof(text_poke_event.old_len) + old_len;
+ tot += sizeof(text_poke_event.new_len) + new_len;
+ pad = ALIGN(tot, sizeof(u64)) - tot;
+
+ text_poke_event = (struct perf_text_poke_event){
+ .old_bytes = old_bytes,
+ .new_bytes = new_bytes,
+ .pad = pad,
+ .old_len = old_len,
+ .new_len = new_len,
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_TEXT_POKE,
+ .misc = PERF_RECORD_MISC_KERNEL,
+ .size = sizeof(text_poke_event.event_id) + tot + pad,
+ },
+ .addr = (unsigned long)addr,
+ },
+ };
+
+ perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
void perf_event_itrace_started(struct perf_event *event)
{
event->attach_state |= PERF_ATTACH_ITRACE;
@@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_ksymbol_events);
if (event->attr.bpf_event)
atomic_inc(&nr_bpf_events);
+ if (event->attr.text_poke)
+ atomic_inc(&nr_text_poke_events);
if (inc) {
/*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 761911168438..06b62746e1df 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1271,9 +1271,6 @@ static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
struct task_struct *t;
- struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
if (!secondary) {
t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -1281,13 +1278,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
} else {
t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
new->name);
- param.sched_priority -= 1;
}
if (IS_ERR(t))
return PTR_ERR(t);
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+ sched_set_fifo(t);
/*
* We keep the reference to the task struct even if
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 16c8c605f4b0..834bfdc43235 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <linux/compiler.h>
/*
@@ -437,6 +438,7 @@ struct kallsym_iter {
loff_t pos_arch_end;
loff_t pos_mod_end;
loff_t pos_ftrace_mod_end;
+ loff_t pos_bpf_end;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols. */
char type;
@@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
return 1;
}
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
{
int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
@@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
static int get_ksymbol_bpf(struct kallsym_iter *iter)
{
+ int ret;
+
strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
iter->exported = 0;
- return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
- &iter->value, &iter->type,
- iter->name) < 0 ? 0 : 1;
+ ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+ &iter->value, &iter->type,
+ iter->name);
+ if (ret < 0) {
+ iter->pos_bpf_end = iter->pos;
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+ strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+ iter->exported = 0;
+ return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+ &iter->value, &iter->type,
+ iter->name) < 0 ? 0 : 1;
}
/* Returns space to next name. */
@@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
iter->pos_arch_end = 0;
iter->pos_mod_end = 0;
iter->pos_ftrace_mod_end = 0;
+ iter->pos_bpf_end = 0;
}
}
@@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
get_ksymbol_ftrace_mod(iter))
return 1;
- return get_ksymbol_bpf(iter);
+ if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+ get_ksymbol_bpf(iter))
+ return 1;
+
+ return get_ksymbol_kprobe(iter);
}
/* Returns false if pos at or past end of file. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4a904cc56d68..0924397d9d59 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>
+#include <linux/perf_event.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -123,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_INSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
@@ -188,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
slot = kip->insns;
+
+ /* Record the perf ksymbol register event after adding the page */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+ PAGE_SIZE, false, c->sym);
out:
mutex_unlock(&c->mutex);
return slot;
@@ -206,6 +212,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
* next time somebody inserts a probe.
*/
if (!list_is_singular(&kip->list)) {
+ /*
+ * Record perf ksymbol unregister event before removing
+ * the page.
+ */
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ (unsigned long)kip->insns, PAGE_SIZE, true,
+ kip->cache->sym);
list_del_rcu(&kip->list);
synchronize_rcu();
kip->cache->free(kip->insns);
@@ -295,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
return ret;
}
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+ unsigned long *value, char *type, char *sym)
+{
+ struct kprobe_insn_page *kip;
+ int ret = -ERANGE;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if ((*symnum)--)
+ continue;
+ strlcpy(sym, c->sym, KSYM_NAME_LEN);
+ *type = 't';
+ *value = (unsigned long)kip->insns;
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
.alloc = alloc_insn_page,
.free = free_insn_page,
+ .sym = KPROBE_OPTINSN_PAGE_SYM,
.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
/* .insn_size is initialized later */
.nr_garbage = 0,
@@ -2232,6 +2267,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
kprobe_remove_area_blacklist(entry, entry + 1);
}
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+ char *type, char *sym)
+{
+ return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+ if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+ return 0;
+#ifdef CONFIG_OPTPROBES
+ if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+ return 0;
+#endif
+#endif
+ if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+ return 0;
+ return -ERANGE;
+}
+
int __init __weak arch_populate_kprobe_blacklist(void)
{
return 0;
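
The kprobe_get_kallsym() chain above passes one index through each provider in turn: a provider consumes as many indices as it has slot pages, fills in the symbol when the index lands in its range, and otherwise returns -ERANGE so the next provider sees the remainder. A minimal sketch of a hypothetical consumer (not part of this patch) walking all out-of-line kprobe symbols:

static void dump_kprobe_ool_symbols(void)
{
	unsigned long value;
	unsigned int i;
	char name[KSYM_NAME_LEN];
	char type;

	for (i = 0; ; i++) {
		/* -ERANGE once every slot page has been enumerated */
		if (kprobe_get_kallsym(i, &value, &type, name))
			break;
		pr_info("%lx %c %s\n", value, type, name);
	}
}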
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 132f84a5fde3..1d9e2fdfd67a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(task, cpu_all_mask);
+ set_cpus_allowed_ptr(task,
+ housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
@@ -608,7 +610,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, cpu_all_mask);
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
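
Because HK_FLAG_KTHREAD is now part of the housekeeping flags (see the isolation.c hunk further down), new kthreads inherit only the housekeeping CPUs instead of cpu_all_mask. A hedged sketch — my_thread_fn and cpu are placeholders — of a kthread that genuinely needs an isolated CPU opting back in explicitly:

struct task_struct *t;

t = kthread_create(my_thread_fn, NULL, "my_worker/%u", cpu);
if (!IS_ERR(t)) {
	/* override the housekeeping-only default affinity */
	kthread_bind(t, cpu);
	wake_up_process(t);
}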
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..fca41a7ded0c 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
static void torture_rtmutex_boost(struct torture_random_state *trsp)
{
- int policy;
- struct sched_param param;
const unsigned int factor = 50000; /* yes, quite arbitrary */
if (!rt_task(current)) {
@@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
*/
if (trsp && !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor))) {
- policy = SCHED_FIFO;
- param.sched_priority = MAX_RT_PRIO - 1;
+ sched_set_fifo(current);
} else /* common case, do nothing */
return;
} else {
@@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
*/
if (!trsp || !(torture_random(trsp) %
(cxt.nrealwriters_stress * factor * 2))) {
- policy = SCHED_NORMAL;
- param.sched_priority = 0;
+ sched_set_normal(current, 0);
} else /* common case, do nothing */
return;
}
-
- sched_setscheduler_nocheck(current, policy, &param);
}
static void torture_rtmutex_delay(struct torture_random_state *trsp)
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 16dd1e6b7c09..0d4f475a7b2e 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -354,7 +354,6 @@ rcu_perf_writer(void *arg)
int i_max;
long me = (long)arg;
struct rcu_head *rhp = NULL;
- struct sched_param sp;
bool started = false, done = false, alldone = false;
u64 t;
u64 *wdp;
@@ -363,8 +362,7 @@ rcu_perf_writer(void *arg)
VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
WARN_ON(!wdpp);
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
- sp.sched_priority = 1;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ sched_set_fifo_low(current);
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
@@ -420,9 +418,7 @@ retry:
started = true;
if (!done && i >= MIN_MEAS) {
done = true;
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current,
- SCHED_NORMAL, &sp);
+ sched_set_normal(current, 0);
pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n",
perf_type, PERF_FLAG, me, MIN_MEAS);
if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
@@ -723,7 +719,7 @@ kfree_perf_init(void)
schedule_timeout_uninterruptible(1);
}
- pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj));
+ pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj));
kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]),
GFP_KERNEL);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..b4c1146de414 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -889,16 +889,11 @@ static int rcu_torture_boost(void *arg)
unsigned long endtime;
unsigned long oldstarttime;
struct rcu_boost_inflight rbi = { .inflight = 0 };
- struct sched_param sp;
VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
- sp.sched_priority = 1;
- if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
- n_rcu_torture_boost_rterror++;
- }
+ sched_set_fifo_low(current);
init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8f360326861e..53c482da7527 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -1637,7 +1639,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(p->cpus_ptr, new_mask))
+ if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
/*
@@ -2293,8 +2295,15 @@ void sched_ttwu_pending(void *arg)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- llist_for_each_entry_safe(p, t, llist, wake_entry)
+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
+ if (WARN_ON_ONCE(p->on_cpu))
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+ set_task_cpu(p, cpu_of(rq));
+
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+ }
rq_unlock_irqrestore(rq, &rf);
}
@@ -2322,7 +2331,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
- __smp_call_single_queue(cpu, &p->wake_entry);
+ __smp_call_single_queue(cpu, &p->wake_entry.llist);
}
void wake_up_if_idle(int cpu)
@@ -2369,7 +2378,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
* the soon-to-be-idle CPU as the current CPU is likely busy.
* nr_running is checked to avoid unnecessary task stacking.
*/
- if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1)
+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
return true;
return false;
@@ -2378,6 +2387,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+ if (WARN_ON_ONCE(cpu == smp_processor_id()))
+ return false;
+
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -2528,7 +2540,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
goto out;
success = 1;
- cpu = task_cpu(p);
trace_sched_waking(p);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
@@ -2550,7 +2561,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
/* We're going to change ->state: */
success = 1;
- cpu = task_cpu(p);
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2614,8 +2624,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* which potentially sends an IPI instead of spinning on p->on_cpu to
* let the waker make forward progress. This is safe because IRQs are
* disabled and the IPI will deliver after on_cpu is cleared.
+ *
+ * Ensure we load task_cpu(p) after p->on_cpu:
+ *
+ * set_task_cpu(p, cpu);
+ * STORE p->cpu = @cpu
+ * __schedule() (switch to task 'p')
+ * LOCK rq->lock
+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
+ * STORE p->on_cpu = 1 LOAD p->cpu
+ *
+ * to ensure we observe the correct CPU on which the task is currently
+ * scheduling.
*/
- if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+ if (smp_load_acquire(&p->on_cpu) &&
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
goto unlock;
/*
@@ -2635,6 +2658,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
+#else
+ cpu = task_cpu(p);
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2642,7 +2667,7 @@ unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
if (success)
- ttwu_stat(p, cpu, wake_flags);
+ ttwu_stat(p, task_cpu(p), wake_flags);
preempt_enable();
return success;
@@ -2763,7 +2788,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
init_numa_balancing(clone_flags, p);
#ifdef CONFIG_SMP
- p->wake_entry_type = CSD_TYPE_TTWU;
+ p->wake_entry.u_flags = CSD_TYPE_TTWU;
#endif
}
@@ -4533,7 +4558,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
*/
if (dl_prio(prio)) {
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -5122,6 +5148,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
* @policy: new policy.
* @param: structure containing the new RT priority.
*
+ * Use sched_set_fifo(), read its comment.
+ *
* Return: 0 on success. An error code otherwise.
*
* NOTE that the task may be already dead.
@@ -5164,6 +5192,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
}
EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
+
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
@@ -5810,7 +5883,7 @@ again:
if (task_running(p_rq, p) || p->state)
goto out_unlock;
- yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ yielded = curr->sched_class->yield_to_task(rq, p);
if (yielded) {
schedstat_inc(rq->yld_count);
/*
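
The locktorture, rcuperf and rcutorture hunks below all follow the conversion pattern these helpers enable; roughly:

	/* before: open-coded policy and priority */
	struct sched_param sp = { .sched_priority = 1 };
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);

	/* after */
	sched_set_fifo_low(current);	/* lowest FIFO priority, just above SCHED_NORMAL */

and, when the thread no longer needs boosting:

	sched_set_normal(current, 0);	/* back to CFS at nice 0 */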
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5cc4012572ec..8cb06c8c7eb1 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+ unsigned long cap, max_cap = 0;
+ int cpu, max_cpu = -1;
+
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 1;
+
+ /* Ensure the capacity of the CPUs fits the task. */
+ for_each_cpu(cpu, later_mask) {
+ if (!dl_task_fits_capacity(p, cpu)) {
+ cpumask_clear_cpu(cpu, later_mask);
+
+ cap = capacity_orig_of(cpu);
+
+ if (cap > max_cap ||
+ (cpu == task_cpu(p) && cap == max_cap)) {
+ max_cap = cap;
+ max_cpu = cpu;
+ }
+ }
+ }
+
+ if (cpumask_empty(later_mask))
+ cpumask_set_cpu(max_cpu, later_mask);
+
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ff9435dee1df..5a55d2300452 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks)
}
/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static u64 scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime)
- swap(rtime, stime);
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return scaled;
-}
-
-/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
*
@@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
goto update;
}
- stime = scale_stime(stime, rtime, stime + utime);
+ stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
update:
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 504d2f51b0d6..2e1483f0a298 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i)
static inline int dl_bw_cpus(int i)
{
struct root_domain *rd = cpu_rq(i)->rd;
- int cpus = 0;
+ int cpus;
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
"sched RCU must be held");
+
+ if (cpumask_subset(rd->span, cpu_active_mask))
+ return cpumask_weight(rd->span);
+
+ cpus = 0;
+
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
return cpus;
}
+
+static inline unsigned long __dl_bw_capacity(int i)
+{
+ struct root_domain *rd = cpu_rq(i)->rd;
+ unsigned long cap = 0;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+
+ for_each_cpu_and(i, rd->span, cpu_active_mask)
+ cap += capacity_orig_of(i);
+
+ return cap;
+}
+
+/*
+ * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity
+ * of the CPU the task is running on rather than rd's \Sum CPU capacity.
+ */
+static inline unsigned long dl_bw_capacity(int i)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) &&
+ capacity_orig_of(i) == SCHED_CAPACITY_SCALE) {
+ return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT;
+ } else {
+ return __dl_bw_capacity(i);
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i)
{
return 1;
}
+
+static inline unsigned long dl_bw_capacity(int i)
+{
+ return SCHED_CAPACITY_SCALE;
+}
#endif
static inline
@@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
* cannot use the runtime, and so it replenishes the task. This rule
* works fine for implicit deadline tasks (deadline == period), and the
* CBS was designed for implicit deadline tasks. However, a task with
- * constrained deadline (deadine < period) might be awakened after the
+ * constrained deadline (deadline < period) might be awakened after the
* deadline, but before the next period. In this case, replenishing the
* task would allow it to run for runtime / deadline. As in this case
* deadline < period, CBS enables a task to run for more than the
@@ -1604,6 +1643,7 @@ static int
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
+ bool select_rq;
struct rq *rq;
if (sd_flag != SD_BALANCE_WAKE)
@@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ select_rq = unlikely(dl_task(curr)) &&
+ (curr->nr_cpus_allowed < 2 ||
+ !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ p->nr_cpus_allowed > 1;
+
+ /*
+ * Take the capacity of the CPU into account to
+ * ensure it fits the requirement of the task.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ select_rq |= !dl_task_fits_capacity(p, cpu);
+
+ if (select_rq) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -2551,11 +2600,12 @@ void sched_dl_do_global(void)
int sched_dl_overflow(struct task_struct *p, int policy,
const struct sched_attr *attr)
{
- struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
- int cpus, err = -1;
+ int cpus, err = -1, cpu = task_cpu(p);
+ struct dl_bw *dl_b = dl_bw_of(cpu);
+ unsigned long cap;
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return 0;
@@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* allocated bandwidth of the container.
*/
raw_spin_lock(&dl_b->lock);
- cpus = dl_bw_cpus(task_cpu(p));
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
if (dl_policy(policy) && !task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+ !__dl_overflow(dl_b, cap, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
- !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+ !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) {
/*
* XXX this is slightly incorrect: when the task
* utilization decreases, we should delay the total
@@ -2635,6 +2687,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
}
/*
+ * Default limits for DL period; on the top end we guard against small util
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
+ * guard against timer DoS.
+ */
+unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
+unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
+
+/*
* This function validates the new parameters of a -deadline task.
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
@@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
*/
bool __checkparam_dl(const struct sched_attr *attr)
{
+ u64 period, max, min;
+
/* special dl tasks don't actually use any parameter */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return true;
@@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr)
attr->sched_period & (1ULL << 63))
return false;
+ period = attr->sched_period;
+ if (!period)
+ period = attr->sched_deadline;
+
/* runtime <= deadline <= period (if period != 0) */
- if ((attr->sched_period != 0 &&
- attr->sched_period < attr->sched_deadline) ||
+ if (period < attr->sched_deadline ||
attr->sched_deadline < attr->sched_runtime)
return false;
+ max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC;
+ min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC;
+
+ if (period < min || period > max)
+ return false;
+
return true;
}
@@ -2692,6 +2763,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
+ dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
@@ -2714,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
+ unsigned long flags, cap;
unsigned int dest_cpu;
struct dl_bw *dl_b;
bool overflow;
- int cpus, ret;
- unsigned long flags;
+ int ret;
dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
rcu_read_lock_sched();
dl_b = dl_bw_of(dest_cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(dest_cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ cap = dl_bw_capacity(dest_cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
if (overflow) {
ret = -EBUSY;
} else {
@@ -2736,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
* We will free resources in the source root_domain
* later on (see set_cpus_allowed_dl()).
*/
+ int cpus = dl_bw_cpus(dest_cpu);
+
__dl_add(dl_b, p->dl.dl_bw, cpus);
ret = 0;
}
@@ -2768,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
bool dl_cpu_busy(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow;
- int cpus;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cpus = dl_bw_cpus(cpu);
- overflow = __dl_overflow(dl_b, cpus, 0, 0);
+ cap = dl_bw_capacity(cpu);
+ overflow = __dl_overflow(dl_b, cap, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
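
Worked example (illustrative numbers) for dl_task_fits_capacity(), which select_task_rq_dl() and cpudl_find() above now consult:

	runtime  = 5 ms, deadline = 10 ms            -> bandwidth 0.5
	cap      = 512  (half of SCHED_CAPACITY_SCALE = 1024)

	cap_scale(deadline, cap) = 10 ms * 512 / 1024 = 5 ms >= runtime

so the task still fits on that CPU; with cap = 256 the scaled deadline drops to 2.5 ms, the check fails, and the wakeup path looks for a later or bigger CPU instead.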
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbcb2f71599b..0424a0af5f87 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3094,7 +3094,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
#ifdef CONFIG_SMP
do {
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+ u32 divider = get_pelt_divider(&se->avg);
se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
} while (0);
@@ -3440,16 +3440,18 @@ static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
se->avg.util_sum = se->avg.util_avg * divider;
@@ -3463,16 +3465,18 @@ static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider;
/* Nothing to update */
if (!delta)
return;
+ /*
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+ * See ___update_load_avg() for details.
+ */
+ divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
se->avg.runnable_sum = se->avg.runnable_avg * divider;
@@ -3500,7 +3504,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ divider = get_pelt_divider(&cfs_rq->avg);
if (runnable_sum >= 0) {
/*
@@ -3646,7 +3650,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq->removed.nr) {
unsigned long r;
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
@@ -3701,7 +3705,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
* See ___update_load_avg() for details.
*/
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
/*
* When we attach the @se to the @cfs_rq, we must align the decay
@@ -3922,6 +3926,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
enqueued = cfs_rq->avg.util_est.enqueued;
enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_cfs_tp(cfs_rq);
}
/*
@@ -3952,6 +3958,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+ trace_sched_util_est_cfs_tp(cfs_rq);
+
/*
* Skip update of task's estimated utilization when the task has not
* yet completed an activation, e.g. being migrated.
@@ -4017,6 +4025,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
done:
WRITE_ONCE(p->se.avg.util_est, ue);
+
+ trace_sched_util_est_se_tp(&p->se);
}
static inline int task_fits_capacity(struct task_struct *p, long capacity)
@@ -7157,7 +7167,7 @@ static void yield_task_fair(struct rq *rq)
set_skip_buddy(se);
}
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -8038,7 +8048,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
@@ -8070,7 +8080,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = scale_rt_capacity(sd, cpu);
+ unsigned long capacity = scale_rt_capacity(cpu);
struct sched_group *sdg = sd->groups;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
@@ -10016,7 +10026,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only if a full ilb is triggered, but
+ * not if we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
@@ -10337,6 +10352,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -10357,14 +10380,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 05deb81bb3e3..8d75ca201484 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -446,11 +446,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
BUG();
}
-static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_idle(struct rq *rq)
{
}
@@ -479,8 +474,6 @@ const struct sched_class idle_sched_class = {
.task_tick = task_tick_idle,
- .get_rr_interval = get_rr_interval_idle,
-
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
.update_curr = update_curr_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 808244f3ddd9..5a6ea03f9882 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+ flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
+ HK_FLAG_MISC | HK_FLAG_KTHREAD;
return housekeeping_setup(str, flags);
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index b4b1ff96642f..11bea3b08115 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -83,8 +83,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
return c1 + c2 + c3;
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
/*
* Accumulate the three separate parts of the sum; d1 the remainder
* of the last (incomplete) period, d2 the span of full periods and d3
@@ -264,7 +262,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
static __always_inline void
___update_load_avg(struct sched_avg *sa, unsigned long load)
{
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+ u32 divider = get_pelt_divider(sa);
/*
* Step 2: update *_avg.
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index eb034d9f024d..795e43e02afc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
+static inline u32 get_pelt_divider(struct sched_avg *avg)
+{
+ return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+}
+
/*
* When a task is dequeued, its estimated utilization should not be updated if
* its util_avg has not been updated at least once.
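
All of the open-coded "LOAD_AVG_MAX - 1024 + sa->period_contrib" expressions replaced in pelt.c and fair.c above now go through this helper; the relationship it serves is unchanged, e.g. at the point ___update_load_avg() runs:

	sa->util_avg == sa->util_sum / get_pelt_divider(sa)

(and likewise for load_avg and runnable_avg), so this part of the series is a mechanical cleanup of the divider computation.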
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 8f45cdb6463b..967732c0766c 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -190,7 +190,6 @@ static void group_init(struct psi_group *group)
INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
/* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
mutex_init(&group->trigger_lock);
INIT_LIST_HEAD(&group->triggers);
memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
@@ -199,7 +198,7 @@ static void group_init(struct psi_group *group)
memset(group->polling_total, 0, sizeof(group->polling_total));
group->polling_next_update = ULLONG_MAX;
group->polling_until = 0;
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
}
void __init psi_init(void)
@@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
-/*
- * Schedule polling if it's not already scheduled. It's safe to call even from
- * hotpath because even though kthread_queue_delayed_work takes worker->lock
- * spinlock that spinlock is never contended due to poll_scheduled atomic
- * preventing such competition.
- */
+/* Schedule polling if it's not already scheduled. */
static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
{
- struct kthread_worker *kworker;
+ struct task_struct *task;
- /* Do not reschedule if already scheduled */
- if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+ /*
+ * Do not reschedule if already scheduled.
+ * A possible race with a timer scheduled after this check but before
+ * mod_timer below can be tolerated because group->polling_next_update
+ * will keep updates on schedule.
+ */
+ if (timer_pending(&group->poll_timer))
return;
rcu_read_lock();
- kworker = rcu_dereference(group->poll_kworker);
+ task = rcu_dereference(group->poll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
- if (likely(kworker))
- kthread_queue_delayed_work(kworker, &group->poll_work, delay);
- else
- atomic_set(&group->poll_scheduled, 0);
+ if (likely(task))
+ mod_timer(&group->poll_timer, jiffies + delay);
rcu_read_unlock();
}
-static void psi_poll_work(struct kthread_work *work)
+static void psi_poll_work(struct psi_group *group)
{
- struct kthread_delayed_work *dwork;
- struct psi_group *group;
u32 changed_states;
u64 now;
- dwork = container_of(work, struct kthread_delayed_work, work);
- group = container_of(dwork, struct psi_group, poll_work);
-
- atomic_set(&group->poll_scheduled, 0);
-
mutex_lock(&group->trigger_lock);
now = sched_clock();
@@ -623,6 +613,32 @@ out:
mutex_unlock(&group->trigger_lock);
}
+static int psi_poll_worker(void *data)
+{
+ struct psi_group *group = (struct psi_group *)data;
+
+ sched_set_fifo_low(current);
+
+ while (true) {
+ wait_event_interruptible(group->poll_wait,
+ atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+ kthread_should_stop());
+ if (kthread_should_stop())
+ break;
+
+ psi_poll_work(group);
+ }
+ return 0;
+}
+
+static void poll_timer_fn(struct timer_list *t)
+{
+ struct psi_group *group = from_timer(group, t, poll_timer);
+
+ atomic_set(&group->poll_wakeup, 1);
+ wake_up_interruptible(&group->poll_wait);
+}
+
static void record_times(struct psi_group_cpu *groupc, int cpu,
bool memstall_tick)
{
@@ -1099,22 +1115,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_lock(&group->trigger_lock);
- if (!rcu_access_pointer(group->poll_kworker)) {
- struct sched_param param = {
- .sched_priority = 1,
- };
- struct kthread_worker *kworker;
+ if (!rcu_access_pointer(group->poll_task)) {
+ struct task_struct *task;
- kworker = kthread_create_worker(0, "psimon");
- if (IS_ERR(kworker)) {
+ task = kthread_create(psi_poll_worker, group, "psimon");
+ if (IS_ERR(task)) {
kfree(t);
mutex_unlock(&group->trigger_lock);
- return ERR_CAST(kworker);
+ return ERR_CAST(task);
}
- sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
- kthread_init_delayed_work(&group->poll_work,
- psi_poll_work);
- rcu_assign_pointer(group->poll_kworker, kworker);
+ atomic_set(&group->poll_wakeup, 0);
+ init_waitqueue_head(&group->poll_wait);
+ wake_up_process(task);
+ timer_setup(&group->poll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->poll_task, task);
}
list_add(&t->node, &group->triggers);
@@ -1132,7 +1146,7 @@ static void psi_trigger_destroy(struct kref *ref)
{
struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
struct psi_group *group = t->group;
- struct kthread_worker *kworker_to_destroy = NULL;
+ struct task_struct *task_to_destroy = NULL;
if (static_branch_likely(&psi_disabled))
return;
@@ -1158,13 +1172,13 @@ static void psi_trigger_destroy(struct kref *ref)
period = min(period, div_u64(tmp->win.size,
UPDATES_PER_WINDOW));
group->poll_min_period = period;
- /* Destroy poll_kworker when the last trigger is destroyed */
+ /* Destroy poll_task when the last trigger is destroyed */
if (group->poll_states == 0) {
group->polling_until = 0;
- kworker_to_destroy = rcu_dereference_protected(
- group->poll_kworker,
+ task_to_destroy = rcu_dereference_protected(
+ group->poll_task,
lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_kworker, NULL);
+ rcu_assign_pointer(group->poll_task, NULL);
}
}
@@ -1172,25 +1186,23 @@ static void psi_trigger_destroy(struct kref *ref)
/*
* Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * poll_task RCUs to complete their read-side critical sections
+ * before destroying the trigger and optionally the poll_task
*/
synchronize_rcu();
/*
* Destroy the kworker after releasing trigger_lock to prevent a
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
- if (kworker_to_destroy) {
+ if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_kworker.
+ * can no longer be found through group->poll_task.
* But it might have been already scheduled before
* that - deschedule it cleanly before destroying it.
*/
- kthread_cancel_delayed_work_sync(&group->poll_work);
- atomic_set(&group->poll_scheduled, 0);
-
- kthread_destroy_worker(kworker_to_destroy);
+ del_timer_sync(&group->poll_timer);
+ kthread_stop(task_to_destroy);
}
kfree(t);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1d4e94c1e5fe..847f43fc9584 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -310,11 +310,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
__dl_update(dl_b, -((s32)tsk_bw / cpus));
}
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
+ u64 old_bw, u64 new_bw)
{
return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * Verify the fitness of task @p to run on @cpu taking into account the
+ * CPU original capacity and the runtime/deadline ratio of the task.
+ *
+ * The function will return true if the CPU original capacity of the
+ * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the
+ * task and false otherwise.
+ */
+static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu)
+{
+ unsigned long cap = arch_scale_cpu_capacity(cpu);
+
+ return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime;
}
extern void init_dl_bw(struct dl_bw *dl_b);
@@ -1682,7 +1697,7 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK 0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
-#define WF_ON_RQ 0x08 /* Wakee is on_rq */
+#define WF_ON_CPU 0x08 /* Wakee is on_cpu */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1748,7 +1763,7 @@ struct sched_class {
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
- bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
+ bool (*yield_to_task)(struct rq *rq, struct task_struct *p);
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 4c9e9975684f..3e50a6a8f1e5 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
BUG(); /* how!?, what priority? */
}
-static unsigned int
-get_rr_interval_stop(struct rq *rq, struct task_struct *task)
-{
- return 0;
-}
-
static void update_curr_stop(struct rq *rq)
{
}
@@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = {
.task_tick = task_tick_stop,
- .get_rr_interval = get_rr_interval_stop,
-
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
.update_curr = update_curr_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ba81187bb7af..9079d865a935 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd_flags = (*tl->sd_flags)();
if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
"wrong sd_flags in topology description\n"))
- sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ sd_flags &= TOPOLOGY_SD_FLAGS;
/* Apply detected topology flags */
sd_flags |= dflags;
diff --git a/kernel/smp.c b/kernel/smp.c
index 472c2b274c65..aa17eedff5be 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,24 +669,6 @@ void __init smp_init(void)
{
int num_nodes, num_cpus;
- /*
- * Ensure struct irq_work layout matches so that
- * flush_smp_call_function_queue() can do horrible things.
- */
- BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
- offsetof(struct __call_single_data, llist));
- BUILD_BUG_ON(offsetof(struct irq_work, func) !=
- offsetof(struct __call_single_data, func));
- BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
- offsetof(struct __call_single_data, flags));
-
- /*
- * Assert the CSD_TYPE_TTWU layout is similar enough
- * for task_struct to be on the @call_single_queue.
- */
- BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
- offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
-
idle_threads_init();
cpuhp_threads_init();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db1ce7af2563..4aea67d3d552 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1780,6 +1780,20 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_rt_handler,
},
{
+ .procname = "sched_deadline_period_max_us",
+ .data = &sysctl_sched_dl_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_deadline_period_min_us",
+ .data = &sysctl_sched_dl_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_rr_timeslice_ms",
.data = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
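
The two new entries surface the deadline-period bounds added in kernel/sched/deadline.c above; a usage sketch (values in microseconds, applied to the next sched_setattr() with a SCHED_DEADLINE policy):

	# sysctl kernel.sched_deadline_period_min_us=1000
	# sysctl kernel.sched_deadline_period_max_us=2000000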
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1903b80db6eb..72064541bef2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2764,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
{
}
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+ lockdep_assert_held(&ftrace_lock);
+ list_del_rcu(&ops->list);
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+ if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+ ops->trampoline) {
+ /*
+ * Record the text poke event before the ksymbol unregister
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline,
+ (void *)ops->trampoline,
+ ops->trampoline_size, NULL, 0);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size,
+ true, FTRACE_TRAMPOLINE_SYM);
+ /* Remove from kallsyms after the perf events */
+ ftrace_remove_trampoline_from_kallsyms(ops);
+ }
+
+ arch_ftrace_trampoline_free(ops);
+}
+
static void ftrace_startup_enable(int command)
{
if (saved_ftrace_func != ftrace_trace_function) {
@@ -2934,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
synchronize_rcu_tasks();
free_ops:
- arch_ftrace_trampoline_free(ops);
+ ftrace_trampoline_free(ops);
}
return 0;
@@ -6178,6 +6222,27 @@ struct ftrace_mod_map {
unsigned int num_funcs;
};
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+ unsigned long *value, char *type,
+ char *name, char *module_name,
+ int *exported)
+{
+ struct ftrace_ops *op;
+
+ list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+ if (!op->trampoline || symnum--)
+ continue;
+ *value = op->trampoline;
+ *type = 't';
+ strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+ strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+ *exported = 0;
+ return 0;
+ }
+
+ return -ERANGE;
+}
+
#ifdef CONFIG_MODULES
#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6514,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
{
struct ftrace_mod_map *mod_map;
struct ftrace_mod_func *mod_func;
+ int ret;
preempt_disable();
list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6540,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
WARN_ON(1);
break;
}
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
preempt_enable();
- return -ERANGE;
+ return ret;
}
#else
@@ -6553,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod,
{
return NULL;
}
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, char *module_name,
+ int *exported)
+{
+ int ret;
+
+ preempt_disable();
+ ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+ module_name, exported);
+ preempt_enable();
+ return ret;
+}
#endif /* CONFIG_MODULES */
struct ftrace_init_func {
@@ -6733,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
static void ftrace_update_trampoline(struct ftrace_ops *ops)
{
+ unsigned long trampoline = ops->trampoline;
+
arch_ftrace_update_trampoline(ops);
+ if (ops->trampoline && ops->trampoline != trampoline &&
+ (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+ /* Add to kallsyms before the perf events */
+ ftrace_add_trampoline_to_kallsyms(ops);
+ perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+ ops->trampoline, ops->trampoline_size, false,
+ FTRACE_TRAMPOLINE_SYM);
+ /*
+ * Record the perf text poke event after the ksymbol register
+ * event.
+ */
+ perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+ (void *)ops->trampoline,
+ ops->trampoline_size);
+ }
}
void ftrace_init_trace_array(struct trace_array *tr)
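
With the register/unregister hooks above, an allocated trampoline becomes visible through kallsyms; the entry looks roughly like (address illustrative):

	ffffffffc0405000 t ftrace_trampoline	[__builtin__ftrace]

matching the FTRACE_TRAMPOLINE_SYM and FTRACE_TRAMPOLINE_MOD strings filled in by ftrace_get_trampoline_kallsym(), while perf receives the corresponding ksymbol and text_poke events in the order the comments describe.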
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 81f5464ea9e1..34b84bcbd3d9 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -15,11 +15,15 @@ config CC_HAS_KASAN_GENERIC
config CC_HAS_KASAN_SW_TAGS
def_bool $(cc-option, -fsanitize=kernel-hwaddress)
+config CC_HAS_WORKING_NOSANITIZE_ADDRESS
+ def_bool !CC_IS_GCC || GCC_VERSION >= 80300
+
config KASAN
bool "KASAN: runtime memory debugger"
depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \
(HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)
depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
+ depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS
help
Enables KASAN (KernelAddressSANitizer) - runtime memory debugger,
designed to find out-of-bounds accesses and use-after-free bugs.
diff --git a/lib/math/div64.c b/lib/math/div64.c
index 368ca7fd0d82..3952a07130d8 100644
--- a/lib/math/div64.c
+++ b/lib/math/div64.c
@@ -190,3 +190,44 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
return __iter_div_u64_rem(dividend, divisor, remainder);
}
EXPORT_SYMBOL(iter_div_u64_rem);
+
+#ifndef mul_u64_u64_div_u64
+u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c)
+{
+ u64 res = 0, div, rem;
+ int shift;
+
+ /* can a * b overflow ? */
+ if (ilog2(a) + ilog2(b) > 62) {
+ /*
+ * (b * a) / c is equal to
+ *
+ * (b / c) * a +
+ * (b % c) * a / c
+ *
+ * if nothing overflows. Can the 1st multiplication
+ * overflow? Yes, but we do not care: this can only
+ * happen if the end result can't fit in u64 anyway.
+ *
+ * So the code below does
+ *
+ * res = (b / c) * a;
+ * b = b % c;
+ */
+ div = div64_u64_rem(b, c, &rem);
+ res = div * a;
+ b = rem;
+
+ shift = ilog2(a) + ilog2(b) - 62;
+ if (shift > 0) {
+ /* drop precision */
+ b >>= shift;
+ c >>= shift;
+ if (!c)
+ return res;
+ }
+ }
+
+ return res + div64_u64(a * b, c);
+}
+#endif
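
A hedged usage sketch mirroring the kernel/sched/cputime.c conversion earlier in this patch: scale the tick-based stime by rtime / (stime + utime) without the old hand-rolled precision-dropping loop.

static u64 scaled_stime(u64 stime, u64 utime, u64 rtime)
{
	/* (stime * rtime) / (stime + utime), overflow-safe for 64-bit inputs */
	return mul_u64_u64_div_u64(stime, rtime, stime + utime);
}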
diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index eda15a5a285e..2e2ce089b0e9 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -82,6 +82,8 @@ bool arch_callee_saved_reg(unsigned char reg);
unsigned long arch_jump_destination(struct instruction *insn);
-unsigned long arch_dest_rela_offset(int addend);
+unsigned long arch_dest_reloc_offset(int addend);
+
+const char *arch_nop_insn(int len);
#endif /* _ARCH_H */
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 4b504fc90bbb..1967370440b3 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -67,7 +67,7 @@ bool arch_callee_saved_reg(unsigned char reg)
}
}
-unsigned long arch_dest_rela_offset(int addend)
+unsigned long arch_dest_reloc_offset(int addend)
{
return addend + 4;
}
@@ -565,3 +565,21 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state)
state->regs[16].base = CFI_CFA;
state->regs[16].offset = -8;
}
+
+const char *arch_nop_insn(int len)
+{
+ static const char nops[5][5] = {
+ /* 1 */ { 0x90 },
+ /* 2 */ { 0x66, 0x90 },
+ /* 3 */ { 0x0f, 0x1f, 0x00 },
+ /* 4 */ { 0x0f, 0x1f, 0x40, 0x00 },
+ /* 5 */ { 0x0f, 0x1f, 0x44, 0x00, 0x00 },
+ };
+
+ if (len < 1 || len > 5) {
+ WARN("invalid NOP size: %d\n", len);
+ return NULL;
+ }
+
+ return nops[len-1];
+}
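
The helper is consumed by the KCOV hunk in check.c below: a 5-byte call to a KCOV hook such as __sanitizer_cov_trace_pc found in .noinstr.text is overwritten in place with arch_nop_insn(5), i.e. the bytes

	0f 1f 44 00 00		/* nopl 0x0(%rax,%rax,1) */

while the relocation that pointed at the KCOV helper is rewritten to R_NONE so nothing relinks the patched bytes.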
diff --git a/tools/objtool/arch/x86/include/arch_elf.h b/tools/objtool/arch/x86/include/arch_elf.h
new file mode 100644
index 000000000000..69cc4264b28a
--- /dev/null
+++ b/tools/objtool/arch/x86/include/arch_elf.h
@@ -0,0 +1,6 @@
+#ifndef _OBJTOOL_ARCH_ELF
+#define _OBJTOOL_ARCH_ELF
+
+#define R_NONE R_X86_64_NONE
+
+#endif /* _OBJTOOL_ARCH_ELF */
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 5fbb90a80d23..d8f898cdab5a 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -12,6 +12,7 @@
#include "check.h"
#include "special.h"
#include "warn.h"
+#include "arch_elf.h"
#include <linux/hashtable.h>
#include <linux/kernel.h>
@@ -352,7 +353,7 @@ static struct instruction *find_last_insn(struct objtool_file *file,
static int add_dead_ends(struct objtool_file *file)
{
struct section *sec;
- struct rela *rela;
+ struct reloc *reloc;
struct instruction *insn;
/*
@@ -370,24 +371,24 @@ static int add_dead_ends(struct objtool_file *file)
if (!sec)
goto reachable;
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s", sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (insn)
insn = list_prev_entry(insn, list);
- else if (rela->addend == rela->sym->sec->len) {
- insn = find_last_insn(file, rela->sym->sec);
+ else if (reloc->addend == reloc->sym->sec->len) {
+ insn = find_last_insn(file, reloc->sym->sec);
if (!insn) {
WARN("can't find unreachable insn at %s+0x%x",
- rela->sym->sec->name, rela->addend);
+ reloc->sym->sec->name, reloc->addend);
return -1;
}
} else {
WARN("can't find unreachable insn at %s+0x%x",
- rela->sym->sec->name, rela->addend);
+ reloc->sym->sec->name, reloc->addend);
return -1;
}
@@ -405,24 +406,24 @@ reachable:
if (!sec)
return 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s", sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (insn)
insn = list_prev_entry(insn, list);
- else if (rela->addend == rela->sym->sec->len) {
- insn = find_last_insn(file, rela->sym->sec);
+ else if (reloc->addend == reloc->sym->sec->len) {
+ insn = find_last_insn(file, reloc->sym->sec);
if (!insn) {
WARN("can't find reachable insn at %s+0x%x",
- rela->sym->sec->name, rela->addend);
+ reloc->sym->sec->name, reloc->addend);
return -1;
}
} else {
WARN("can't find reachable insn at %s+0x%x",
- rela->sym->sec->name, rela->addend);
+ reloc->sym->sec->name, reloc->addend);
return -1;
}
@@ -440,26 +441,26 @@ static void add_ignores(struct objtool_file *file)
struct instruction *insn;
struct section *sec;
struct symbol *func;
- struct rela *rela;
+ struct reloc *reloc;
sec = find_section_by_name(file->elf, ".rela.discard.func_stack_frame_non_standard");
if (!sec)
return;
- list_for_each_entry(rela, &sec->rela_list, list) {
- switch (rela->sym->type) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ switch (reloc->sym->type) {
case STT_FUNC:
- func = rela->sym;
+ func = reloc->sym;
break;
case STT_SECTION:
- func = find_func_by_offset(rela->sym->sec, rela->addend);
+ func = find_func_by_offset(reloc->sym->sec, reloc->addend);
if (!func)
continue;
break;
default:
- WARN("unexpected relocation symbol type in %s: %d", sec->name, rela->sym->type);
+ WARN("unexpected relocation symbol type in %s: %d", sec->name, reloc->sym->type);
continue;
}
@@ -579,20 +580,20 @@ static void add_uaccess_safe(struct objtool_file *file)
static int add_ignore_alternatives(struct objtool_file *file)
{
struct section *sec;
- struct rela *rela;
+ struct reloc *reloc;
struct instruction *insn;
sec = find_section_by_name(file->elf, ".rela.discard.ignore_alts");
if (!sec)
return 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s", sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!insn) {
WARN("bad .discard.ignore_alts entry");
return -1;
@@ -610,7 +611,7 @@ static int add_ignore_alternatives(struct objtool_file *file)
static int add_jump_destinations(struct objtool_file *file)
{
struct instruction *insn;
- struct rela *rela;
+ struct reloc *reloc;
struct section *dest_sec;
unsigned long dest_off;
@@ -621,19 +622,19 @@ static int add_jump_destinations(struct objtool_file *file)
if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET)
continue;
- rela = find_rela_by_dest_range(file->elf, insn->sec,
+ reloc = find_reloc_by_dest_range(file->elf, insn->sec,
insn->offset, insn->len);
- if (!rela) {
+ if (!reloc) {
dest_sec = insn->sec;
dest_off = arch_jump_destination(insn);
- } else if (rela->sym->type == STT_SECTION) {
- dest_sec = rela->sym->sec;
- dest_off = arch_dest_rela_offset(rela->addend);
- } else if (rela->sym->sec->idx) {
- dest_sec = rela->sym->sec;
- dest_off = rela->sym->sym.st_value +
- arch_dest_rela_offset(rela->addend);
- } else if (strstr(rela->sym->name, "_indirect_thunk_")) {
+ } else if (reloc->sym->type == STT_SECTION) {
+ dest_sec = reloc->sym->sec;
+ dest_off = arch_dest_reloc_offset(reloc->addend);
+ } else if (reloc->sym->sec->idx) {
+ dest_sec = reloc->sym->sec;
+ dest_off = reloc->sym->sym.st_value +
+ arch_dest_reloc_offset(reloc->addend);
+ } else if (strstr(reloc->sym->name, "_indirect_thunk_")) {
/*
* Retpoline jumps are really dynamic jumps in
* disguise, so convert them accordingly.
@@ -647,7 +648,7 @@ static int add_jump_destinations(struct objtool_file *file)
continue;
} else {
/* external sibling call */
- insn->call_dest = rela->sym;
+ insn->call_dest = reloc->sym;
continue;
}
@@ -723,15 +724,15 @@ static int add_call_destinations(struct objtool_file *file)
{
struct instruction *insn;
unsigned long dest_off;
- struct rela *rela;
+ struct reloc *reloc;
for_each_insn(file, insn) {
if (insn->type != INSN_CALL)
continue;
- rela = find_rela_by_dest_range(file->elf, insn->sec,
+ reloc = find_reloc_by_dest_range(file->elf, insn->sec,
insn->offset, insn->len);
- if (!rela) {
+ if (!reloc) {
dest_off = arch_jump_destination(insn);
insn->call_dest = find_func_by_offset(insn->sec, dest_off);
if (!insn->call_dest)
@@ -751,19 +752,37 @@ static int add_call_destinations(struct objtool_file *file)
return -1;
}
- } else if (rela->sym->type == STT_SECTION) {
- dest_off = arch_dest_rela_offset(rela->addend);
- insn->call_dest = find_func_by_offset(rela->sym->sec,
+ } else if (reloc->sym->type == STT_SECTION) {
+ dest_off = arch_dest_reloc_offset(reloc->addend);
+ insn->call_dest = find_func_by_offset(reloc->sym->sec,
dest_off);
if (!insn->call_dest) {
WARN_FUNC("can't find call dest symbol at %s+0x%lx",
insn->sec, insn->offset,
- rela->sym->sec->name,
+ reloc->sym->sec->name,
dest_off);
return -1;
}
} else
- insn->call_dest = rela->sym;
+ insn->call_dest = reloc->sym;
+
+ /*
+ * Many compilers cannot disable KCOV with a function attribute
+ * so they need a little help, NOP out any KCOV calls from noinstr
+ * text.
+ */
+ if (insn->sec->noinstr &&
+ !strncmp(insn->call_dest->name, "__sanitizer_cov_", 16)) {
+ if (reloc) {
+ reloc->type = R_NONE;
+ elf_write_reloc(file->elf, reloc);
+ }
+
+ elf_write_insn(file->elf, insn->sec,
+ insn->offset, insn->len,
+ arch_nop_insn(insn->len));
+ insn->type = INSN_NOP;
+ }
/*
* Whatever stack impact regular CALLs have, should be undone
@@ -871,7 +890,7 @@ static int handle_group_alt(struct objtool_file *file,
*/
if ((insn->offset != special_alt->new_off ||
(insn->type != INSN_CALL && !is_static_jump(insn))) &&
- find_rela_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) {
+ find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) {
WARN_FUNC("unsupported relocation in alternatives section",
insn->sec, insn->offset);
@@ -1017,34 +1036,34 @@ out:
}
static int add_jump_table(struct objtool_file *file, struct instruction *insn,
- struct rela *table)
+ struct reloc *table)
{
- struct rela *rela = table;
+ struct reloc *reloc = table;
struct instruction *dest_insn;
struct alternative *alt;
struct symbol *pfunc = insn->func->pfunc;
unsigned int prev_offset = 0;
/*
- * Each @rela is a switch table relocation which points to the target
+ * Each @reloc is a switch table relocation which points to the target
* instruction.
*/
- list_for_each_entry_from(rela, &table->sec->rela_list, list) {
+ list_for_each_entry_from(reloc, &table->sec->reloc_list, list) {
/* Check for the end of the table: */
- if (rela != table && rela->jump_table_start)
+ if (reloc != table && reloc->jump_table_start)
break;
/* Make sure the table entries are consecutive: */
- if (prev_offset && rela->offset != prev_offset + 8)
+ if (prev_offset && reloc->offset != prev_offset + 8)
break;
/* Detect function pointers from contiguous objects: */
- if (rela->sym->sec == pfunc->sec &&
- rela->addend == pfunc->offset)
+ if (reloc->sym->sec == pfunc->sec &&
+ reloc->addend == pfunc->offset)
break;
- dest_insn = find_insn(file, rela->sym->sec, rela->addend);
+ dest_insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!dest_insn)
break;
@@ -1060,7 +1079,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn,
alt->insn = dest_insn;
list_add_tail(&alt->list, &insn->alts);
- prev_offset = rela->offset;
+ prev_offset = reloc->offset;
}
if (!prev_offset) {
@@ -1115,11 +1134,11 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn,
*
* NOTE: RETPOLINE made it harder still to decode dynamic jumps.
*/
-static struct rela *find_jump_table(struct objtool_file *file,
+static struct reloc *find_jump_table(struct objtool_file *file,
struct symbol *func,
struct instruction *insn)
{
- struct rela *text_rela, *table_rela;
+ struct reloc *text_reloc, *table_reloc;
struct instruction *dest_insn, *orig_insn = insn;
struct section *table_sec;
unsigned long table_offset;
@@ -1144,16 +1163,16 @@ static struct rela *find_jump_table(struct objtool_file *file,
break;
/* look for a relocation which references .rodata */
- text_rela = find_rela_by_dest_range(file->elf, insn->sec,
+ text_reloc = find_reloc_by_dest_range(file->elf, insn->sec,
insn->offset, insn->len);
- if (!text_rela || text_rela->sym->type != STT_SECTION ||
- !text_rela->sym->sec->rodata)
+ if (!text_reloc || text_reloc->sym->type != STT_SECTION ||
+ !text_reloc->sym->sec->rodata)
continue;
- table_offset = text_rela->addend;
- table_sec = text_rela->sym->sec;
+ table_offset = text_reloc->addend;
+ table_sec = text_reloc->sym->sec;
- if (text_rela->type == R_X86_64_PC32)
+ if (text_reloc->type == R_X86_64_PC32)
table_offset += 4;
/*
@@ -1170,14 +1189,14 @@ static struct rela *find_jump_table(struct objtool_file *file,
continue;
/*
- * Each table entry has a rela associated with it. The rela
+ * Each table entry has a reloc associated with it. The reloc
* should reference text in the same function as the original
* instruction.
*/
- table_rela = find_rela_by_dest(file->elf, table_sec, table_offset);
- if (!table_rela)
+ table_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset);
+ if (!table_reloc)
continue;
- dest_insn = find_insn(file, table_rela->sym->sec, table_rela->addend);
+ dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend);
if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func)
continue;
@@ -1186,10 +1205,10 @@ static struct rela *find_jump_table(struct objtool_file *file,
* indicates a rare GCC quirk/bug which can leave dead code
* behind.
*/
- if (text_rela->type == R_X86_64_PC32)
+ if (text_reloc->type == R_X86_64_PC32)
file->ignore_unreachables = true;
- return table_rela;
+ return table_reloc;
}
return NULL;
@@ -1203,7 +1222,7 @@ static void mark_func_jump_tables(struct objtool_file *file,
struct symbol *func)
{
struct instruction *insn, *last = NULL;
- struct rela *rela;
+ struct reloc *reloc;
func_for_each_insn(file, func, insn) {
if (!last)
@@ -1226,10 +1245,10 @@ static void mark_func_jump_tables(struct objtool_file *file,
if (insn->type != INSN_JUMP_DYNAMIC)
continue;
- rela = find_jump_table(file, func, insn);
- if (rela) {
- rela->jump_table_start = true;
- insn->jump_table = rela;
+ reloc = find_jump_table(file, func, insn);
+ if (reloc) {
+ reloc->jump_table_start = true;
+ insn->jump_table = reloc;
}
}
}
@@ -1283,8 +1302,8 @@ static int add_jump_table_alts(struct objtool_file *file)
static int read_unwind_hints(struct objtool_file *file)
{
- struct section *sec, *relasec;
- struct rela *rela;
+ struct section *sec, *relocsec;
+ struct reloc *reloc;
struct unwind_hint *hint;
struct instruction *insn;
struct cfi_reg *cfa;
@@ -1294,8 +1313,8 @@ static int read_unwind_hints(struct objtool_file *file)
if (!sec)
return 0;
- relasec = sec->rela;
- if (!relasec) {
+ relocsec = sec->reloc;
+ if (!relocsec) {
WARN("missing .rela.discard.unwind_hints section");
return -1;
}
@@ -1310,13 +1329,13 @@ static int read_unwind_hints(struct objtool_file *file)
for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) {
hint = (struct unwind_hint *)sec->data->d_buf + i;
- rela = find_rela_by_dest(file->elf, sec, i * sizeof(*hint));
- if (!rela) {
- WARN("can't find rela for unwind_hints[%d]", i);
+ reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint));
+ if (!reloc) {
+ WARN("can't find reloc for unwind_hints[%d]", i);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!insn) {
WARN("can't find insn for unwind_hints[%d]", i);
return -1;
@@ -1374,19 +1393,19 @@ static int read_retpoline_hints(struct objtool_file *file)
{
struct section *sec;
struct instruction *insn;
- struct rela *rela;
+ struct reloc *reloc;
sec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe");
if (!sec)
return 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s", sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!insn) {
WARN("bad .discard.retpoline_safe entry");
return -1;
@@ -1409,19 +1428,19 @@ static int read_instr_hints(struct objtool_file *file)
{
struct section *sec;
struct instruction *insn;
- struct rela *rela;
+ struct reloc *reloc;
sec = find_section_by_name(file->elf, ".rela.discard.instr_end");
if (!sec)
return 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s", sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!insn) {
WARN("bad .discard.instr_end entry");
return -1;
@@ -1434,13 +1453,13 @@ static int read_instr_hints(struct objtool_file *file)
if (!sec)
return 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
- if (rela->sym->type != STT_SECTION) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s", sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!insn) {
WARN("bad .discard.instr_begin entry");
return -1;
@@ -1456,22 +1475,22 @@ static int read_intra_function_calls(struct objtool_file *file)
{
struct instruction *insn;
struct section *sec;
- struct rela *rela;
+ struct reloc *reloc;
sec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls");
if (!sec)
return 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
unsigned long dest_off;
- if (rela->sym->type != STT_SECTION) {
+ if (reloc->sym->type != STT_SECTION) {
WARN("unexpected relocation symbol type in %s",
sec->name);
return -1;
}
- insn = find_insn(file, rela->sym->sec, rela->addend);
+ insn = find_insn(file, reloc->sym->sec, reloc->addend);
if (!insn) {
WARN("bad .discard.intra_function_call entry");
return -1;
@@ -2190,10 +2209,36 @@ static inline const char *call_dest_name(struct instruction *insn)
return "{dynamic}";
}
+static inline bool noinstr_call_dest(struct symbol *func)
+{
+ /*
+ * We can't deal with indirect function calls at present;
+ * assume they're instrumented.
+ */
+ if (!func)
+ return false;
+
+ /*
+ * If the symbol is from a noinstr section, we're good.
+ */
+ if (func->sec->noinstr)
+ return true;
+
+ /*
+ * The __ubsan_handle_*() calls are like WARN(), they only happen when
+ * something 'BAD' happened. At the risk of taking the machine down,
+ * let them proceed to get the message out.
+ */
+ if (!strncmp(func->name, "__ubsan_handle_", 15))
+ return true;
+
+ return false;
+}
+
static int validate_call(struct instruction *insn, struct insn_state *state)
{
if (state->noinstr && state->instr <= 0 &&
- (!insn->call_dest || !insn->call_dest->sec->noinstr)) {
+ !noinstr_call_dest(insn->call_dest)) {
WARN_FUNC("call to %s() leaves .noinstr.text section",
insn->sec, insn->offset, call_dest_name(insn));
return 1;
@@ -2740,7 +2785,7 @@ int check(const char *_objname, bool orc)
objname = _objname;
- file.elf = elf_open_read(objname, orc ? O_RDWR : O_RDONLY);
+ file.elf = elf_open_read(objname, O_RDWR);
if (!file.elf)
return 1;
@@ -2801,7 +2846,9 @@ int check(const char *_objname, bool orc)
ret = create_orc_sections(&file);
if (ret < 0)
goto out;
+ }
+ if (file.elf->changed) {
ret = elf_write(file.elf);
if (ret < 0)
goto out;
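
For illustration of the KCOV NOP-out hunk earlier in this file's diff (the elf_write_insn()/arch_nop_insn() call site): a minimal sketch of what an x86 arch_nop_insn() could return, padding the whole call site with single-byte 0x90 NOPs. This is an assumption for clarity only; the helper referenced by the patch is defined outside the hunks shown here and may well use multi-byte NOP encodings instead.

const char *arch_nop_insn(int len)
{
	/* Sixteen single-byte NOPs (0x90); more than any x86 call we patch. */
	static const char nops[] =
		"\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90\x90";

	if (len < 1 || len > 16)
		return NULL;

	return nops;
}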
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index 906b5210f7ca..061aa96e15d3 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -37,7 +37,7 @@ struct instruction {
struct symbol *call_dest;
struct instruction *jump_dest;
struct instruction *first_jump_src;
- struct rela *jump_table;
+ struct reloc *jump_table;
struct list_head alts;
struct symbol *func;
struct list_head stack_ops;
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 84225679f96d..3ddbd66f1a37 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -228,26 +228,26 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name)
return NULL;
}
-struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec,
+struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec,
unsigned long offset, unsigned int len)
{
- struct rela *rela, *r = NULL;
+ struct reloc *reloc, *r = NULL;
unsigned long o;
- if (!sec->rela)
+ if (!sec->reloc)
return NULL;
- sec = sec->rela;
+ sec = sec->reloc;
for_offset_range(o, offset, offset + len) {
- elf_hash_for_each_possible(elf->rela_hash, rela, hash,
+ elf_hash_for_each_possible(elf->reloc_hash, reloc, hash,
sec_offset_hash(sec, o)) {
- if (rela->sec != sec)
+ if (reloc->sec != sec)
continue;
- if (rela->offset >= offset && rela->offset < offset + len) {
- if (!r || rela->offset < r->offset)
- r = rela;
+ if (reloc->offset >= offset && reloc->offset < offset + len) {
+ if (!r || reloc->offset < r->offset)
+ r = reloc;
}
}
if (r)
@@ -257,9 +257,9 @@ struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec,
return NULL;
}
-struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset)
+struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset)
{
- return find_rela_by_dest_range(elf, sec, offset, 1);
+ return find_reloc_by_dest_range(elf, sec, offset, 1);
}
static int read_sections(struct elf *elf)
@@ -288,7 +288,7 @@ static int read_sections(struct elf *elf)
memset(sec, 0, sizeof(*sec));
INIT_LIST_HEAD(&sec->symbol_list);
- INIT_LIST_HEAD(&sec->rela_list);
+ INIT_LIST_HEAD(&sec->reloc_list);
s = elf_getscn(elf->elf, i);
if (!s) {
@@ -434,7 +434,13 @@ static int read_symbols(struct elf *elf)
size_t pnamelen;
if (sym->type != STT_FUNC)
continue;
- sym->pfunc = sym->cfunc = sym;
+
+ if (sym->pfunc == NULL)
+ sym->pfunc = sym;
+
+ if (sym->cfunc == NULL)
+ sym->cfunc = sym;
+
coldstr = strstr(sym->name, ".cold");
if (!coldstr)
continue;
@@ -482,71 +488,101 @@ err:
return -1;
}
-void elf_add_rela(struct elf *elf, struct rela *rela)
+void elf_add_reloc(struct elf *elf, struct reloc *reloc)
+{
+ struct section *sec = reloc->sec;
+
+ list_add_tail(&reloc->list, &sec->reloc_list);
+ elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
+}
+
+static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx)
{
- struct section *sec = rela->sec;
+ if (!gelf_getrel(sec->data, i, &reloc->rel)) {
+ WARN_ELF("gelf_getrel");
+ return -1;
+ }
+ reloc->type = GELF_R_TYPE(reloc->rel.r_info);
+ reloc->addend = 0;
+ reloc->offset = reloc->rel.r_offset;
+ *symndx = GELF_R_SYM(reloc->rel.r_info);
+ return 0;
+}
- list_add_tail(&rela->list, &sec->rela_list);
- elf_hash_add(elf->rela_hash, &rela->hash, rela_hash(rela));
+static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx)
+{
+ if (!gelf_getrela(sec->data, i, &reloc->rela)) {
+ WARN_ELF("gelf_getrela");
+ return -1;
+ }
+ reloc->type = GELF_R_TYPE(reloc->rela.r_info);
+ reloc->addend = reloc->rela.r_addend;
+ reloc->offset = reloc->rela.r_offset;
+ *symndx = GELF_R_SYM(reloc->rela.r_info);
+ return 0;
}
-static int read_relas(struct elf *elf)
+static int read_relocs(struct elf *elf)
{
struct section *sec;
- struct rela *rela;
+ struct reloc *reloc;
int i;
unsigned int symndx;
- unsigned long nr_rela, max_rela = 0, tot_rela = 0;
+ unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;
list_for_each_entry(sec, &elf->sections, list) {
- if (sec->sh.sh_type != SHT_RELA)
+ if ((sec->sh.sh_type != SHT_RELA) &&
+ (sec->sh.sh_type != SHT_REL))
continue;
- sec->base = find_section_by_name(elf, sec->name + 5);
+ sec->base = find_section_by_index(elf, sec->sh.sh_info);
if (!sec->base) {
- WARN("can't find base section for rela section %s",
+ WARN("can't find base section for reloc section %s",
sec->name);
return -1;
}
- sec->base->rela = sec;
+ sec->base->reloc = sec;
- nr_rela = 0;
+ nr_reloc = 0;
for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) {
- rela = malloc(sizeof(*rela));
- if (!rela) {
+ reloc = malloc(sizeof(*reloc));
+ if (!reloc) {
perror("malloc");
return -1;
}
- memset(rela, 0, sizeof(*rela));
-
- if (!gelf_getrela(sec->data, i, &rela->rela)) {
- WARN_ELF("gelf_getrela");
- return -1;
+ memset(reloc, 0, sizeof(*reloc));
+ switch (sec->sh.sh_type) {
+ case SHT_REL:
+ if (read_rel_reloc(sec, i, reloc, &symndx))
+ return -1;
+ break;
+ case SHT_RELA:
+ if (read_rela_reloc(sec, i, reloc, &symndx))
+ return -1;
+ break;
+ default: return -1;
}
- rela->type = GELF_R_TYPE(rela->rela.r_info);
- rela->addend = rela->rela.r_addend;
- rela->offset = rela->rela.r_offset;
- symndx = GELF_R_SYM(rela->rela.r_info);
- rela->sym = find_symbol_by_index(elf, symndx);
- rela->sec = sec;
- if (!rela->sym) {
- WARN("can't find rela entry symbol %d for %s",
+ reloc->sec = sec;
+ reloc->idx = i;
+ reloc->sym = find_symbol_by_index(elf, symndx);
+ if (!reloc->sym) {
+ WARN("can't find reloc entry symbol %d for %s",
symndx, sec->name);
return -1;
}
- elf_add_rela(elf, rela);
- nr_rela++;
+ elf_add_reloc(elf, reloc);
+ nr_reloc++;
}
- max_rela = max(max_rela, nr_rela);
- tot_rela += nr_rela;
+ max_reloc = max(max_reloc, nr_reloc);
+ tot_reloc += nr_reloc;
}
if (stats) {
- printf("max_rela: %lu\n", max_rela);
- printf("tot_rela: %lu\n", tot_rela);
+ printf("max_reloc: %lu\n", max_reloc);
+ printf("tot_reloc: %lu\n", tot_reloc);
}
return 0;
@@ -572,7 +608,7 @@ struct elf *elf_open_read(const char *name, int flags)
elf_hash_init(elf->symbol_name_hash);
elf_hash_init(elf->section_hash);
elf_hash_init(elf->section_name_hash);
- elf_hash_init(elf->rela_hash);
+ elf_hash_init(elf->reloc_hash);
elf->fd = open(name, flags);
if (elf->fd == -1) {
@@ -605,7 +641,7 @@ struct elf *elf_open_read(const char *name, int flags)
if (read_symbols(elf))
goto err;
- if (read_relas(elf))
+ if (read_relocs(elf))
goto err;
return elf;
@@ -631,7 +667,7 @@ struct section *elf_create_section(struct elf *elf, const char *name,
memset(sec, 0, sizeof(*sec));
INIT_LIST_HEAD(&sec->symbol_list);
- INIT_LIST_HEAD(&sec->rela_list);
+ INIT_LIST_HEAD(&sec->reloc_list);
s = elf_newscn(elf->elf);
if (!s) {
@@ -713,28 +749,60 @@ struct section *elf_create_section(struct elf *elf, const char *name,
elf_hash_add(elf->section_hash, &sec->hash, sec->idx);
elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name));
+ elf->changed = true;
+
return sec;
}
-struct section *elf_create_rela_section(struct elf *elf, struct section *base)
+static struct section *elf_create_rel_reloc_section(struct elf *elf, struct section *base)
{
- char *relaname;
+ char *relocname;
struct section *sec;
- relaname = malloc(strlen(base->name) + strlen(".rela") + 1);
- if (!relaname) {
+ relocname = malloc(strlen(base->name) + strlen(".rel") + 1);
+ if (!relocname) {
perror("malloc");
return NULL;
}
- strcpy(relaname, ".rela");
- strcat(relaname, base->name);
+ strcpy(relocname, ".rel");
+ strcat(relocname, base->name);
- sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0);
- free(relaname);
+ sec = elf_create_section(elf, relocname, sizeof(GElf_Rel), 0);
+ free(relocname);
if (!sec)
return NULL;
- base->rela = sec;
+ base->reloc = sec;
+ sec->base = base;
+
+ sec->sh.sh_type = SHT_REL;
+ sec->sh.sh_addralign = 8;
+ sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx;
+ sec->sh.sh_info = base->idx;
+ sec->sh.sh_flags = SHF_INFO_LINK;
+
+ return sec;
+}
+
+static struct section *elf_create_rela_reloc_section(struct elf *elf, struct section *base)
+{
+ char *relocname;
+ struct section *sec;
+
+ relocname = malloc(strlen(base->name) + strlen(".rela") + 1);
+ if (!relocname) {
+ perror("malloc");
+ return NULL;
+ }
+ strcpy(relocname, ".rela");
+ strcat(relocname, base->name);
+
+ sec = elf_create_section(elf, relocname, sizeof(GElf_Rela), 0);
+ free(relocname);
+ if (!sec)
+ return NULL;
+
+ base->reloc = sec;
sec->base = base;
sec->sh.sh_type = SHT_RELA;
@@ -746,40 +814,143 @@ struct section *elf_create_rela_section(struct elf *elf, struct section *base)
return sec;
}
-int elf_rebuild_rela_section(struct section *sec)
+struct section *elf_create_reloc_section(struct elf *elf,
+ struct section *base,
+ int reltype)
{
- struct rela *rela;
- int nr, idx = 0, size;
- GElf_Rela *relas;
+ switch (reltype) {
+ case SHT_REL: return elf_create_rel_reloc_section(elf, base);
+ case SHT_RELA: return elf_create_rela_reloc_section(elf, base);
+ default: return NULL;
+ }
+}
- nr = 0;
- list_for_each_entry(rela, &sec->rela_list, list)
- nr++;
+static int elf_rebuild_rel_reloc_section(struct section *sec, int nr)
+{
+ struct reloc *reloc;
+ int idx = 0, size;
+ GElf_Rel *relocs;
+
+ /* Allocate a buffer for relocations */
+ size = nr * sizeof(*relocs);
+ relocs = malloc(size);
+ if (!relocs) {
+ perror("malloc");
+ return -1;
+ }
+
+ sec->data->d_buf = relocs;
+ sec->data->d_size = size;
+
+ sec->sh.sh_size = size;
+
+ idx = 0;
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ relocs[idx].r_offset = reloc->offset;
+ relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ idx++;
+ }
+
+ return 0;
+}
- size = nr * sizeof(*relas);
- relas = malloc(size);
- if (!relas) {
+static int elf_rebuild_rela_reloc_section(struct section *sec, int nr)
+{
+ struct reloc *reloc;
+ int idx = 0, size;
+ GElf_Rela *relocs;
+
+ /* Allocate a buffer for relocations with addends */
+ size = nr * sizeof(*relocs);
+ relocs = malloc(size);
+ if (!relocs) {
perror("malloc");
return -1;
}
- sec->data->d_buf = relas;
+ sec->data->d_buf = relocs;
sec->data->d_size = size;
sec->sh.sh_size = size;
idx = 0;
- list_for_each_entry(rela, &sec->rela_list, list) {
- relas[idx].r_offset = rela->offset;
- relas[idx].r_addend = rela->addend;
- relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type);
+ list_for_each_entry(reloc, &sec->reloc_list, list) {
+ relocs[idx].r_offset = reloc->offset;
+ relocs[idx].r_addend = reloc->addend;
+ relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
idx++;
}
return 0;
}
-int elf_write(const struct elf *elf)
+int elf_rebuild_reloc_section(struct elf *elf, struct section *sec)
+{
+ struct reloc *reloc;
+ int nr;
+
+ sec->changed = true;
+ elf->changed = true;
+
+ nr = 0;
+ list_for_each_entry(reloc, &sec->reloc_list, list)
+ nr++;
+
+ switch (sec->sh.sh_type) {
+ case SHT_REL: return elf_rebuild_rel_reloc_section(sec, nr);
+ case SHT_RELA: return elf_rebuild_rela_reloc_section(sec, nr);
+ default: return -1;
+ }
+}
+
+int elf_write_insn(struct elf *elf, struct section *sec,
+ unsigned long offset, unsigned int len,
+ const char *insn)
+{
+ Elf_Data *data = sec->data;
+
+ if (data->d_type != ELF_T_BYTE || data->d_off) {
+ WARN("write to unexpected data for section: %s", sec->name);
+ return -1;
+ }
+
+ memcpy(data->d_buf + offset, insn, len);
+ elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY);
+
+ elf->changed = true;
+
+ return 0;
+}
+
+int elf_write_reloc(struct elf *elf, struct reloc *reloc)
+{
+ struct section *sec = reloc->sec;
+
+ if (sec->sh.sh_type == SHT_REL) {
+ reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ reloc->rel.r_offset = reloc->offset;
+
+ if (!gelf_update_rel(sec->data, reloc->idx, &reloc->rel)) {
+ WARN_ELF("gelf_update_rel");
+ return -1;
+ }
+ } else {
+ reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+ reloc->rela.r_addend = reloc->addend;
+ reloc->rela.r_offset = reloc->offset;
+
+ if (!gelf_update_rela(sec->data, reloc->idx, &reloc->rela)) {
+ WARN_ELF("gelf_update_rela");
+ return -1;
+ }
+ }
+
+ elf->changed = true;
+
+ return 0;
+}
+
+int elf_write(struct elf *elf)
{
struct section *sec;
Elf_Scn *s;
@@ -796,6 +967,8 @@ int elf_write(const struct elf *elf)
WARN_ELF("gelf_update_shdr");
return -1;
}
+
+ sec->changed = false;
}
}
@@ -808,6 +981,8 @@ int elf_write(const struct elf *elf)
return -1;
}
+ elf->changed = false;
+
return 0;
}
@@ -815,7 +990,7 @@ void elf_close(struct elf *elf)
{
struct section *sec, *tmpsec;
struct symbol *sym, *tmpsym;
- struct rela *rela, *tmprela;
+ struct reloc *reloc, *tmpreloc;
if (elf->elf)
elf_end(elf->elf);
@@ -829,10 +1004,10 @@ void elf_close(struct elf *elf)
hash_del(&sym->hash);
free(sym);
}
- list_for_each_entry_safe(rela, tmprela, &sec->rela_list, list) {
- list_del(&rela->list);
- hash_del(&rela->hash);
- free(rela);
+ list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) {
+ list_del(&reloc->list);
+ hash_del(&reloc->hash);
+ free(reloc);
}
list_del(&sec->list);
free(sec);
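
To make the SHT_REL/SHT_RELA split introduced above concrete, here is a small sketch (not from the patch) of the field-level difference that read_rel_reloc() and read_rela_reloc() normalize. It assumes only libelf's gelf.h types; objtool records addend = 0 for SHT_REL because those entries carry no explicit addend.

#include <gelf.h>

/* SHT_RELA: type, symbol index and an explicit addend are in the entry. */
static long rela_addend(const GElf_Rela *rela)
{
	return rela->r_addend;
}

/* SHT_REL: there is no r_addend member; the addend is implicit in the
 * bytes at r_offset, which is why read_rel_reloc() stores 0. */
static long rel_addend(const GElf_Rel *rel)
{
	(void)rel;
	return 0;
}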
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index f4fe1d6ea392..6cc80a075166 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -32,8 +32,8 @@ struct section {
GElf_Shdr sh;
struct rb_root symbol_tree;
struct list_head symbol_list;
- struct list_head rela_list;
- struct section *base, *rela;
+ struct list_head reloc_list;
+ struct section *base, *reloc;
struct symbol *sym;
Elf_Data *data;
char *name;
@@ -58,15 +58,19 @@ struct symbol {
bool uaccess_safe;
};
-struct rela {
+struct reloc {
struct list_head list;
struct hlist_node hash;
- GElf_Rela rela;
+ union {
+ GElf_Rela rela;
+ GElf_Rel rel;
+ };
struct section *sec;
struct symbol *sym;
- unsigned int type;
unsigned long offset;
+ unsigned int type;
int addend;
+ int idx;
bool jump_table_start;
};
@@ -76,13 +80,14 @@ struct elf {
Elf *elf;
GElf_Ehdr ehdr;
int fd;
+ bool changed;
char *name;
struct list_head sections;
DECLARE_HASHTABLE(symbol_hash, ELF_HASH_BITS);
DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS);
DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS);
DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS);
- DECLARE_HASHTABLE(rela_hash, ELF_HASH_BITS);
+ DECLARE_HASHTABLE(reloc_hash, ELF_HASH_BITS);
};
#define OFFSET_STRIDE_BITS 4
@@ -109,16 +114,20 @@ static inline u32 sec_offset_hash(struct section *sec, unsigned long offset)
return ol;
}
-static inline u32 rela_hash(struct rela *rela)
+static inline u32 reloc_hash(struct reloc *reloc)
{
- return sec_offset_hash(rela->sec, rela->offset);
+ return sec_offset_hash(reloc->sec, reloc->offset);
}
struct elf *elf_open_read(const char *name, int flags);
struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr);
-struct section *elf_create_rela_section(struct elf *elf, struct section *base);
-void elf_add_rela(struct elf *elf, struct rela *rela);
-int elf_write(const struct elf *elf);
+struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype);
+void elf_add_reloc(struct elf *elf, struct reloc *reloc);
+int elf_write_insn(struct elf *elf, struct section *sec,
+ unsigned long offset, unsigned int len,
+ const char *insn);
+int elf_write_reloc(struct elf *elf, struct reloc *reloc);
+int elf_write(struct elf *elf);
void elf_close(struct elf *elf);
struct section *find_section_by_name(const struct elf *elf, const char *name);
@@ -126,11 +135,11 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset);
struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
struct symbol *find_symbol_by_name(const struct elf *elf, const char *name);
struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset);
-struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset);
-struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec,
+struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset);
+struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec,
unsigned long offset, unsigned int len);
struct symbol *find_func_containing(struct section *sec, unsigned long offset);
-int elf_rebuild_rela_section(struct section *sec);
+int elf_rebuild_reloc_section(struct elf *elf, struct section *sec);
#define for_each_sec(file, sec) \
list_for_each_entry(sec, &file->elf->sections, list)
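
A short sketch (illustration only, not part of the patch) of how the new rel/rela union in struct reloc is meant to be used: the owning reloc section's sh_type selects which libelf record is live, while the normalized type, offset and addend fields stay format-independent.

/* Assumes the objtool elf.h declarations shown above. */
static unsigned long raw_reloc_offset(struct reloc *reloc)
{
	if (reloc->sec->sh.sh_type == SHT_REL)
		return reloc->rel.r_offset;

	return reloc->rela.r_offset;
}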
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index c9549988121a..968f55e6dd94 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -80,56 +80,56 @@ int create_orc(struct objtool_file *file)
return 0;
}
-static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relasec,
+static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relocsec,
unsigned int idx, struct section *insn_sec,
unsigned long insn_off, struct orc_entry *o)
{
struct orc_entry *orc;
- struct rela *rela;
+ struct reloc *reloc;
/* populate ORC data */
orc = (struct orc_entry *)u_sec->data->d_buf + idx;
memcpy(orc, o, sizeof(*orc));
- /* populate rela for ip */
- rela = malloc(sizeof(*rela));
- if (!rela) {
+ /* populate reloc for ip */
+ reloc = malloc(sizeof(*reloc));
+ if (!reloc) {
perror("malloc");
return -1;
}
- memset(rela, 0, sizeof(*rela));
+ memset(reloc, 0, sizeof(*reloc));
if (insn_sec->sym) {
- rela->sym = insn_sec->sym;
- rela->addend = insn_off;
+ reloc->sym = insn_sec->sym;
+ reloc->addend = insn_off;
} else {
/*
* The Clang assembler doesn't produce section symbols, so we
* have to reference the function symbol instead:
*/
- rela->sym = find_symbol_containing(insn_sec, insn_off);
- if (!rela->sym) {
+ reloc->sym = find_symbol_containing(insn_sec, insn_off);
+ if (!reloc->sym) {
/*
* Hack alert. This happens when we need to reference
* the NOP pad insn immediately after the function.
*/
- rela->sym = find_symbol_containing(insn_sec,
+ reloc->sym = find_symbol_containing(insn_sec,
insn_off - 1);
}
- if (!rela->sym) {
+ if (!reloc->sym) {
WARN("missing symbol for insn at offset 0x%lx\n",
insn_off);
return -1;
}
- rela->addend = insn_off - rela->sym->offset;
+ reloc->addend = insn_off - reloc->sym->offset;
}
- rela->type = R_X86_64_PC32;
- rela->offset = idx * sizeof(int);
- rela->sec = ip_relasec;
+ reloc->type = R_X86_64_PC32;
+ reloc->offset = idx * sizeof(int);
+ reloc->sec = ip_relocsec;
- elf_add_rela(elf, rela);
+ elf_add_reloc(elf, reloc);
return 0;
}
@@ -137,7 +137,7 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti
int create_orc_sections(struct objtool_file *file)
{
struct instruction *insn, *prev_insn;
- struct section *sec, *u_sec, *ip_relasec;
+ struct section *sec, *u_sec, *ip_relocsec;
unsigned int idx;
struct orc_entry empty = {
@@ -181,8 +181,8 @@ int create_orc_sections(struct objtool_file *file)
if (!sec)
return -1;
- ip_relasec = elf_create_rela_section(file->elf, sec);
- if (!ip_relasec)
+ ip_relocsec = elf_create_reloc_section(file->elf, sec, SHT_RELA);
+ if (!ip_relocsec)
return -1;
/* create .orc_unwind section */
@@ -200,7 +200,7 @@ int create_orc_sections(struct objtool_file *file)
if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc,
sizeof(struct orc_entry))) {
- if (create_orc_entry(file->elf, u_sec, ip_relasec, idx,
+ if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx,
insn->sec, insn->offset,
&insn->orc))
return -1;
@@ -212,7 +212,7 @@ int create_orc_sections(struct objtool_file *file)
/* section terminator */
if (prev_insn) {
- if (create_orc_entry(file->elf, u_sec, ip_relasec, idx,
+ if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx,
prev_insn->sec,
prev_insn->offset + prev_insn->len,
&empty))
@@ -222,7 +222,7 @@ int create_orc_sections(struct objtool_file *file)
}
}
- if (elf_rebuild_rela_section(ip_relasec))
+ if (elf_rebuild_reloc_section(file->elf, ip_relocsec))
return -1;
return 0;
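
For context, a minimal sketch (assuming the objtool elf.h API as modified in this diff, not a verbatim excerpt) of the call sequence create_orc_sections() now follows: create a reloc section for the generated data, queue an in-memory reloc, then rebuild the section so elf_write() can emit it.

/* Illustrative only; assumes tools/objtool/elf.h as modified above. */
#include <stdlib.h>

static int emit_generated_reloc(struct elf *elf, struct section *base,
				struct symbol *sym, unsigned long offset)
{
	struct section *rsec;
	struct reloc *reloc;

	rsec = elf_create_reloc_section(elf, base, SHT_RELA);
	if (!rsec)
		return -1;

	reloc = calloc(1, sizeof(*reloc));
	if (!reloc) {
		perror("calloc");
		return -1;
	}

	reloc->sym = sym;
	reloc->type = R_X86_64_PC32;
	reloc->addend = 0;
	reloc->offset = offset;
	reloc->sec = rsec;
	elf_add_reloc(elf, reloc);

	/* Serialize the in-memory list into section data for elf_write(). */
	return elf_rebuild_reloc_section(elf, rsec);
}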
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index e74e0189de22..e893f1e48e44 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -72,7 +72,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
struct section *sec, int idx,
struct special_alt *alt)
{
- struct rela *orig_rela, *new_rela;
+ struct reloc *orig_reloc, *new_reloc;
unsigned long offset;
offset = idx * entry->size;
@@ -118,30 +118,30 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
}
}
- orig_rela = find_rela_by_dest(elf, sec, offset + entry->orig);
- if (!orig_rela) {
- WARN_FUNC("can't find orig rela", sec, offset + entry->orig);
+ orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig);
+ if (!orig_reloc) {
+ WARN_FUNC("can't find orig reloc", sec, offset + entry->orig);
return -1;
}
- if (orig_rela->sym->type != STT_SECTION) {
- WARN_FUNC("don't know how to handle non-section rela symbol %s",
- sec, offset + entry->orig, orig_rela->sym->name);
+ if (orig_reloc->sym->type != STT_SECTION) {
+ WARN_FUNC("don't know how to handle non-section reloc symbol %s",
+ sec, offset + entry->orig, orig_reloc->sym->name);
return -1;
}
- alt->orig_sec = orig_rela->sym->sec;
- alt->orig_off = orig_rela->addend;
+ alt->orig_sec = orig_reloc->sym->sec;
+ alt->orig_off = orig_reloc->addend;
if (!entry->group || alt->new_len) {
- new_rela = find_rela_by_dest(elf, sec, offset + entry->new);
- if (!new_rela) {
- WARN_FUNC("can't find new rela",
+ new_reloc = find_reloc_by_dest(elf, sec, offset + entry->new);
+ if (!new_reloc) {
+ WARN_FUNC("can't find new reloc",
sec, offset + entry->new);
return -1;
}
- alt->new_sec = new_rela->sym->sec;
- alt->new_off = (unsigned int)new_rela->addend;
+ alt->new_sec = new_reloc->sym->sec;
+ alt->new_off = (unsigned int)new_reloc->addend;
/* _ASM_EXTABLE_EX hack */
if (alt->new_off >= 0x7ffffff0)
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index 15a329da59fa..9a4349813a30 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -489,11 +489,27 @@ static void test_ptrace_write_gsbase(void)
* selector value is changed or not by the GSBASE write in
* a ptracer.
*/
- if (gs == 0 && base == 0xFF) {
- printf("[OK]\tGS was reset as expected\n");
- } else {
+ if (gs != *shared_scratch) {
nerrs++;
- printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base);
+ printf("[FAIL]\tGS changed to %lx\n", gs);
+
+ /*
+ * On older kernels, poking a nonzero value into the
+ * base would zero the selector. On newer kernels,
+ * this behavior has changed -- poking the base
+ * changes only the base and, if FSGSBASE is not
+ * available, this may have no effect.
+ */
+ if (gs == 0)
+ printf("\tNote: this is expected behavior on older kernels.\n");
+ } else if (have_fsgsbase && (base != 0xFF)) {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to %lx\n", base);
+ } else {
+ printf("[OK]\tGS remained 0x%hx", *shared_scratch);
+ if (have_fsgsbase)
+ printf(" and GSBASE changed to 0xFF");
+ printf("\n");
}
}
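
The hunk above keys several expectations on have_fsgsbase. As a hedged illustration only (the selftest's own detection code is not shown in this hunk), user space can learn whether the kernel enabled FSGSBASE from the ELF auxiliary vector, using the HWCAP2 bit added elsewhere in this series; the bit position below is an assumption.

#include <elf.h>
#include <sys/auxv.h>

#ifndef HWCAP2_FSGSBASE
# define HWCAP2_FSGSBASE (1UL << 1)	/* assumed bit position */
#endif

static int detect_fsgsbase(void)
{
	return !!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE);
}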
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
index bc0ecc2e862e..62fba40866d5 100644
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -72,6 +72,7 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
if (ax != -EFAULT && ax != -ENOSYS) {
printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
(unsigned long)ax);
+ printf("\tIP = 0x%lx\n", (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
n_errs++;
} else {
printf("[OK]\tSeems okay\n");
@@ -226,5 +227,30 @@ int main()
}
set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+#ifdef __x86_64__
+ printf("[RUN]\tSYSENTER with TF, invalid state, and GSBASE < 0\n");
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ sigtrap_consecutive_syscalls = 0;
+
+ asm volatile ("wrgsbase %%rax\n\t"
+ :: "a" (0xffffffffffff0000UL));
+
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile (
+ "movl $-1, %%eax\n\t"
+ "movl $-1, %%ebx\n\t"
+ "movl $-1, %%ecx\n\t"
+ "movl $-1, %%edx\n\t"
+ "movl $-1, %%esi\n\t"
+ "movl $-1, %%edi\n\t"
+ "movl $-1, %%ebp\n\t"
+ "movl $-1, %%esp\n\t"
+ "sysenter"
+ : : : "memory", "flags");
+ }
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+#endif
+
return 0;
}