author     Stephen Rothwell <sfr@canb.auug.org.au>   2020-06-24 12:39:50 +1000
committer  Stephen Rothwell <sfr@canb.auug.org.au>   2020-06-24 12:39:51 +1000
commit     e3e5f7f48c27ddba468ad67f06b742e78d7e5ab6 (patch)
tree       d1ad683fb6a7780b879d7baaa4579e4d85b111be
parent     5b3cfd00355b17e4465669a5c50e984f1c37f2b8 (diff)
parent     9f2a2b6c23758d798f548911f6119def723d1029 (diff)
download   linux-next-e3e5f7f48c27ddba468ad67f06b742e78d7e5ab6.tar.gz
Merge remote-tracking branch 'tip/auto-latest'
154 files changed, 3095 insertions, 1013 deletions
diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping new file mode 100644 index 000000000000..490ccfd67f12 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-mapping @@ -0,0 +1,33 @@ +What: /sys/devices/uncore_iio_x/dieX +Date: February 2020 +Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com> +Description: + Each IIO stack (PCIe root port) has its own IIO PMON block, so + each dieX file (where X is die number) holds "Segment:Root Bus" + for PCIe root port, which can be monitored by that IIO PMON + block. + For example, on 4-die Xeon platform with up to 6 IIO stacks per + die and, therefore, 6 IIO PMON blocks per die, the mapping of + IIO PMON block 0 exposes as the following: + + $ ls /sys/devices/uncore_iio_0/die* + -r--r--r-- /sys/devices/uncore_iio_0/die0 + -r--r--r-- /sys/devices/uncore_iio_0/die1 + -r--r--r-- /sys/devices/uncore_iio_0/die2 + -r--r--r-- /sys/devices/uncore_iio_0/die3 + + $ tail /sys/devices/uncore_iio_0/die* + ==> /sys/devices/uncore_iio_0/die0 <== + 0000:00 + ==> /sys/devices/uncore_iio_0/die1 <== + 0000:40 + ==> /sys/devices/uncore_iio_0/die2 <== + 0000:80 + ==> /sys/devices/uncore_iio_0/die3 <== + 0000:c0 + + Which means: + IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000 + IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000 + IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000 + IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b5beb1fe78cf..bd6cd95e9a90 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3081,6 +3081,8 @@ no5lvl [X86-64] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. + nofsgsbase [X86] Disables FSGSBASE instructions. + no_console_suspend [HW] Never suspend the console Disable suspending of consoles during suspend and diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index ce4bbd918648..b38379f06194 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -114,12 +114,6 @@ the below options are available: To dynamically limit for which functions to generate reports, see the `DebugFS interface`_ blacklist/whitelist feature. - For ``__always_inline`` functions, replace ``__always_inline`` with - ``__no_kcsan_or_inline`` (which implies ``__always_inline``):: - - static __no_kcsan_or_inline void foo(void) { - ... - * To disable data race detection for a particular compilation unit, add to the ``Makefile``:: diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst new file mode 100644 index 000000000000..50960e09e1f6 --- /dev/null +++ b/Documentation/x86/x86_64/fsgs.rst @@ -0,0 +1,199 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Using FS and GS segments in user space applications +=================================================== + +The x86 architecture supports segmentation. Instructions which access +memory can use segment register based addressing mode. The following +notation is used to address a byte within a segment: + + Segment-register:Byte-address + +The segment base address is added to the Byte-address to compute the +resulting virtual address which is accessed. This allows to access multiple +instances of data with the identical Byte-address, i.e. the same code. 
The +selection of a particular instance is purely based on the base-address in +the segment register. + +In 32-bit mode the CPU provides 6 segments, which also support segment +limits. The limits can be used to enforce address space protections. + +In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is +always 0 to provide a full 64bit address space. The FS and GS segments are +still functional in 64-bit mode. + +Common FS and GS usage +------------------------------ + +The FS segment is commonly used to address Thread Local Storage (TLS). FS +is usually managed by runtime code or a threading library. Variables +declared with the '__thread' storage class specifier are instantiated per +thread and the compiler emits the FS: address prefix for accesses to these +variables. Each thread has its own FS base address so common code can be +used without complex address offset calculations to access the per thread +instances. Applications should not use FS for other purposes when they use +runtimes or threading libraries which manage the per thread FS. + +The GS segment has no common use and can be used freely by +applications. GCC and Clang support GS based addressing via address space +identifiers. + +Reading and writing the FS/GS base address +------------------------------------------ + +There exist two mechanisms to read and write the FS/GS base address: + + - the arch_prctl() system call + + - the FSGSBASE instruction family + +Accessing FS/GS base with arch_prctl() +-------------------------------------- + + The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all + kernel versions. + + Reading the base: + + arch_prctl(ARCH_GET_FS, &fsbase); + arch_prctl(ARCH_GET_GS, &gsbase); + + Writing the base: + + arch_prctl(ARCH_SET_FS, fsbase); + arch_prctl(ARCH_SET_GS, gsbase); + + The ARCH_SET_GS prctl may be disabled depending on kernel configuration + and security settings. + +Accessing FS/GS base with the FSGSBASE instructions +--------------------------------------------------- + + With the Ivy Bridge CPU generation Intel introduced a new set of + instructions to access the FS and GS base registers directly from user + space. These instructions are also supported on AMD Family 17H CPUs. The + following instructions are available: + + =============== =========================== + RDFSBASE %reg Read the FS base register + RDGSBASE %reg Read the GS base register + WRFSBASE %reg Write the FS base register + WRGSBASE %reg Write the GS base register + =============== =========================== + + The instructions avoid the overhead of the arch_prctl() syscall and allow + more flexible usage of the FS/GS addressing modes in user space + applications. This does not prevent conflicts between threading libraries + and runtimes which utilize FS and applications which want to use it for + their own purpose. + +FSGSBASE instructions enablement +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If + available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs. + + The availability of the instructions does not enable them + automatically. The kernel has to enable them explicitly in CR4. The + reason for this is that older kernels make assumptions about the values in + the GS register and enforce them when GS base is set via + arch_prctl(). Allowing user space to write arbitrary values to GS base + would violate these assumptions and cause malfunction. 
+ + On kernels which do not enable FSGSBASE the execution of the FSGSBASE + instructions will fault with a #UD exception. + + The kernel provides reliable information about the enabled state in the + ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the + kernel has FSGSBASE instructions enabled and applications can use them. + The following code example shows how this detection works:: + + #include <sys/auxv.h> + #include <elf.h> + + /* Will be eventually in asm/hwcap.h */ + #ifndef HWCAP2_FSGSBASE + #define HWCAP2_FSGSBASE (1 << 1) + #endif + + .... + + unsigned val = getauxval(AT_HWCAP2); + + if (val & HWCAP2_FSGSBASE) + printf("FSGSBASE enabled\n"); + +FSGSBASE instructions compiler support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE +instructions. Clang 5 supports them as well. + + =================== =========================== + _readfsbase_u64() Read the FS base register + _readfsbase_u64() Read the GS base register + _writefsbase_u64() Write the FS base register + _writegsbase_u64() Write the GS base register + =================== =========================== + +To utilize these instrinsics <immintrin.h> must be included in the source +code and the compiler option -mfsgsbase has to be added. + +Compiler support for FS/GS based addressing +------------------------------------------- + +GCC version 6 and newer provide support for FS/GS based addressing via +Named Address Spaces. GCC implements the following address space +identifiers for x86: + + ========= ==================================== + __seg_fs Variable is addressed relative to FS + __seg_gs Variable is addressed relative to GS + ========= ==================================== + +The preprocessor symbols __SEG_FS and __SEG_GS are defined when these +address spaces are supported. Code which implements fallback modes should +check whether these symbols are defined. Usage example:: + + #ifdef __SEG_GS + + long data0 = 0; + long data1 = 1; + + long __seg_gs *ptr; + + /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */ + .... + + /* Set GS base to point to data0 */ + _writegsbase_u64(&data0); + + /* Access offset 0 of GS */ + ptr = 0; + printf("data0 = %ld\n", *ptr); + + /* Set GS base to point to data1 */ + _writegsbase_u64(&data1); + /* ptr still addresses offset 0! 
*/ + printf("data1 = %ld\n", *ptr); + + +Clang does not provide the GCC address space identifiers, but it provides +address spaces via an attribute based mechanism in Clang 2.6 and newer +versions: + + ==================================== ===================================== + __attribute__((address_space(256)) Variable is addressed relative to GS + __attribute__((address_space(257)) Variable is addressed relative to FS + ==================================== ===================================== + +FS/GS based addressing with inline assembly +------------------------------------------- + +In case the compiler does not support address spaces, inline assembly can +be used for FS/GS based addressing mode:: + + mov %fs:offset, %reg + mov %gs:offset, %reg + + mov %reg, %fs:offset + mov %reg, %gs:offset diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst index d6eaaa5a35fc..a56070fc8e77 100644 --- a/Documentation/x86/x86_64/index.rst +++ b/Documentation/x86/x86_64/index.rst @@ -14,3 +14,4 @@ x86_64 Support fake-numa-for-cpusets cpu-hotplug-spec machinecheck + fsgs diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 746e1fce777e..9a9aa53547a6 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -270,12 +270,11 @@ static struct bL_thread bL_threads[NR_CPUS]; static int bL_switcher_thread(void *arg) { struct bL_thread *t = arg; - struct sched_param param = { .sched_priority = 1 }; int cluster; bL_switch_completion_handler completer; void *completer_cookie; - sched_setscheduler_nocheck(current, SCHED_FIFO, ¶m); + sched_set_fifo_low(current); complete(&t->started); do { diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 84dc0ba822f5..5dcf3c6011b7 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -87,4 +87,11 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, return dram_base + SZ_512M; } +struct efi_arm_entry_state { + u32 cpsr_before_ebs; + u32 sctlr_before_ebs; + u32 cpsr_after_ebs; + u32 sctlr_after_ebs; +}; + #endif /* _ASM_ARM_EFI_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6a0cc524882d..0fe0446102a7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -67,7 +67,7 @@ config X86 select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_KCOV if X86_64 + select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE @@ -1292,7 +1292,6 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - select FW_LOADER help If you say Y here, you will be able to update the microcode on Intel and AMD processors. The Intel support is for the IA32 family, @@ -1314,7 +1313,6 @@ config MICROCODE_INTEL bool "Intel microcode loading support" depends on MICROCODE default MICROCODE - select FW_LOADER help This options enables microcode patch loading support for Intel processors. @@ -1326,7 +1324,6 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode loading support" depends on MICROCODE - select FW_LOADER help If you select this option, microcode patch loading support for AMD processors will be enabled. 
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index e821a7d7d5c4..97d37f0a34f5 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -213,7 +213,6 @@ SYM_FUNC_START(startup_32) * We place all of the values on our mini stack so lret can * used to perform that far jump. */ - pushl $__KERNEL_CS leal startup_64(%ebp), %eax #ifdef CONFIG_EFI_MIXED movl efi32_boot_args(%ebp), %edi @@ -224,11 +223,20 @@ SYM_FUNC_START(startup_32) movl efi32_boot_args+8(%ebp), %edx // saved bootparams pointer cmpl $0, %edx jnz 1f + /* + * efi_pe_entry uses MS calling convention, which requires 32 bytes of + * shadow space on the stack even if all arguments are passed in + * registers. We also need an additional 8 bytes for the space that + * would be occupied by the return address, and this also results in + * the correct stack alignment for entry. + */ + subl $40, %esp leal efi_pe_entry(%ebp), %eax movl %edi, %ecx // MS calling convention movl %esi, %edx 1: #endif + pushl $__KERNEL_CS pushl %eax /* Enter paged protected Mode, activating Long Mode */ @@ -784,6 +792,7 @@ SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0) SYM_DATA_START_LOCAL(boot_stack) .fill BOOT_STACK_SIZE, 1, 0 + .balign 16 SYM_DATA_END_LABEL(boot_stack, SYM_L_LOCAL, boot_stack_end) /* diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 4208c1e3f601..98e4d8886f11 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include <asm/percpu.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/inst.h> /* @@ -341,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req + rdgsbase \save_reg + GET_PERCPU_BASE \scratch_reg + wrgsbase \scratch_reg +.endm + #else /* CONFIG_X86_64 */ # undef UNWIND_HINT_IRET_REGS # define UNWIND_HINT_IRET_REGS @@ -351,3 +358,36 @@ For 32-bit we have the following conventions - kernel is built with call stackleak_erase #endif .endm + +#ifdef CONFIG_SMP + +/* + * CPU/node NR is loaded from the limit (size) field of a special segment + * descriptor entry in GDT. + */ +.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req + movq $__CPUNODE_SEG, \reg + lsl \reg, \reg +.endm + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %gs for accessing per-CPU data, but we are setting up + * %gs here and obviously can not use %gs itself to access per-CPU data. + */ +.macro GET_PERCPU_BASE reg:req + ALTERNATIVE \ + "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ + "RDPID \reg", \ + X86_FEATURE_RDPID + andq $VDSO_CPUNODE_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +#else + +.macro GET_PERCPU_BASE reg:req + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a00c97e53f..fb729f4c4fbc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,6 +38,7 @@ #include <asm/frame.h> #include <asm/trapnr.h> #include <asm/nospec-branch.h> +#include <asm/fsgsbase.h> #include <linux/err.h> #include "calling.h" @@ -426,10 +427,7 @@ SYM_CODE_START(\asmsym) testb $3, CS-ORIG_RAX(%rsp) jnz .Lfrom_usermode_switch_stack_\@ - /* - * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. - * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS - */ + /* paranoid_entry returns GS information for paranoid_exit in EBX. 
*/ call paranoid_entry UNWIND_HINT_REGS @@ -458,10 +456,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=8 ASM_CLAC - /* - * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. - * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS - */ + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ call paranoid_entry UNWIND_HINT_REGS @@ -798,24 +793,21 @@ SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. Return GSBASE related information + * in EBX depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y GSBASE value at entry, must be restored in paranoid_exit */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx -1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -825,16 +817,60 @@ SYM_CODE_START_LOCAL(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 /* + * Handling GSBASE depends on the availability of FSGSBASE. + * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. + */ + ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE + + /* + * Read the current GSBASE and store it in %rbx unconditionally, + * retrieve and set the current CPUs kernel GSBASE. The stored value + * has to be restored in paranoid_exit unconditionally. + * + * The MSR write ensures that no subsequent load is based on a + * mispredicted GSBASE. No extra FENCE required. + */ + SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx + ret + +.Lparanoid_entry_checkgs: + /* EBX = 1 -> kernel GSBASE active, no restore required */ + movl $1, %ebx + /* + * The kernel-enforced convention is a negative GSBASE indicates + * a kernel value. No SWAPGS needed on entry and exit. + */ + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + jns .Lparanoid_entry_swapgs + ret + +.Lparanoid_entry_swapgs: + SWAPGS + + /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an * unconditional CR3 write, even in the PTI case. So do an lfence * to prevent GS speculation, regardless of whether PTI is enabled. */ FENCE_SWAPGS_KERNEL_ENTRY + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx ret SYM_CODE_END(paranoid_entry) @@ -845,23 +881,45 @@ SYM_CODE_END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. + * be complicated. 
Fortunately, there's no good reason to try + * to handle preemption here. + * + * R/EBX contains the GSBASE related information depending on the + * availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * Y User space GSBASE, must be restored unconditionally */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - testl %ebx, %ebx /* swapgs needed? */ - jnz .Lparanoid_exit_no_swapgs - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + /* + * The order of operations is important. RESTORE_CR3 requires + * kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. Those + * exceptions go through error_exit instead. + */ + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + wrgsbase %rbx + jmp restore_regs_and_return_to_kernel + +.Lparanoid_exit_checkgs: + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + testl %ebx, %ebx + jnz restore_regs_and_return_to_kernel + + /* We are returning to a context with user GSBASE */ SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel -.Lparanoid_exit_no_swapgs: - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - jmp restore_regs_and_return_to_kernel + jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) /* @@ -1266,10 +1324,27 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - testl %ebx, %ebx /* swapgs needed? */ + /* + * The above invocation of paranoid_entry stored the GSBASE + * related information in R/EBX depending on the availability + * of FSGSBASE. + * + * If FSGSBASE is enabled, restore the saved GSBASE value + * unconditionally, otherwise take the conditional SWAPGS path. + */ + ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE + + wrgsbase %rbx + jmp nmi_restore + +nmi_no_fsgsbase: + /* EBX == 0 -> invoke SWAPGS */ + testl %ebx, %ebx jnz nmi_restore + nmi_swapgs: SWAPGS_UNSAFE_STACK + nmi_restore: POP_REGS diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 12c42eba77ec..9933c0e8e97a 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += core.o probe.o -obj-$(PERF_EVENTS_INTEL_RAPL) += rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 43b09e9c93a2..16a2369c586e 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -13,10 +13,6 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" -#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a -#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b -#define MSR_F15H_PTSC 0xc0010280 - /* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. 
*/ #define AMD_POWER_EVENT_MASK 0xFFULL diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cf76d6631afa..d5c6d3b340c5 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } @@ -843,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } if (pmu->type->num_boxes == 1) { @@ -877,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -887,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -915,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? 
i : -1; @@ -954,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: @@ -1112,7 +1123,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1514,6 +1525,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), @@ -1539,7 +1552,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b469ddd45515..105fdc69825e 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -61,6 +61,7 @@ struct intel_uncore_type { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; @@ -72,7 +73,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. 
+ */ + u64 *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] @@ -169,6 +182,18 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ @@ -196,6 +221,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 3de1065eefc4..cb94ba86efd2 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -42,6 +42,17 @@ #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 +#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44 +#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54 +#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64 +#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51 +#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61 +#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71 +#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33 +#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43 +#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53 +#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63 +#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 #define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 @@ -415,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -430,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -586,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = { .num_counters = 2, .num_boxes = 1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, @@ 
-771,6 +787,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -863,6 +923,17 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ + IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } @@ -1085,11 +1156,13 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void) } #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 +#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = tgl_uncore_get_mc_dev(); struct intel_uncore_pmu *pmu = box->pmu; + struct intel_uncore_type *type = pmu->type; resource_size_t addr; u32 mch_bar; @@ -1112,7 +1185,9 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if 
(!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); } static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { @@ -1138,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 2, .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = tgl_uncore_imc_freerunning, .ops = &tgl_uncore_imc_freerunning_ops, .event_descs = tgl_uncore_imc_events, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 07652fa20ebb..62e88ad919ff 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -273,6 +273,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode; +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_bus *bus = pci_find_next_bus(NULL); + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + /* + * Current implementation is for single segment configuration hence it's + * safe to take the segment value from the first available root bus. + */ + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), + skx_iio_stack(uncore_pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. 
+ */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int i, ret; + struct pci_bus *bus = NULL; + + /* + * Verified single-segment environments only; disabled for multiple + * segment topologies for now except VMD domains. + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. + */ + while ((bus = pci_find_next_bus(bus)) + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) + ; + if (bus) + return -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (i = 0; i < uncore_max_dies(); i++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); + if (ret) { + kfree(type->topology); + type->topology = NULL; + return ret; + } + } + + return 0; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = skx_iio_get_topology(type); + if (ret) + return ret; + + /* One more for NULL. */ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto err; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto err; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + skx_iio_mapping_group.attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); + kfree(attrs); + kfree(type->topology); + type->attr_update = NULL; + return -ENOMEM; +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + struct attribute **attr = skx_iio_mapping_group.attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); + kfree(skx_iio_mapping_group.attrs); + skx_iio_mapping_group.attrs = NULL; + kfree(type->topology); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { @@ -4421,6 +4612,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, unsigned int box_ctl, int mem_offset) { struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; @@ -4435,9 +4627,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) + box->io_addr 
= ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); return; + } writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } @@ -4480,6 +4674,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4492,6 +4689,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } @@ -4530,6 +4730,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4570,6 +4771,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4987,6 +5189,7 @@ static struct intel_uncore_type icx_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &icx_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -5044,6 +5247,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = { .num_counters = 5, .num_boxes = 4, .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = icx_imc_freerunning, .ops = &icx_uncore_imc_freerunning_ops, .event_descs = icx_uncore_imc_freerunning_events, diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 0f63585edf5f..5c15f95b1ba7 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -144,7 +144,7 @@ _ASM_PTR (entry); \ .popsection -#else +#else /* ! __ASSEMBLY__ */ # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -164,9 +164,7 @@ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) /* For C file, we already have NOKPROBE_SYMBOL macro */ -#endif -#ifndef __ASSEMBLY__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. 
Otherwise the asm may be inserted before the frame pointer @@ -175,6 +173,6 @@ */ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) -#endif +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 35460fef39b8..0367efdc5b7a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -201,12 +201,8 @@ arch_test_and_change_bit(long nr, volatile unsigned long *addr) return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } -static __no_kcsan_or_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) +static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { - /* - * Because this is a plain access, we need to disable KCSAN here to - * avoid double instrumentation via instrumented bitops. - */ return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index fb34ff641e0a..028189575560 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -75,6 +75,12 @@ do { \ unreachable(); \ } while (0) +/* + * This instrumentation_begin() is strictly speaking incorrect; but it + * suppresses the complaints from WARN()s in noinstr code. If such a WARN() + * were to trigger, we'd rather wreck the machine in an attempt to get the + * message out than not know about it. + */ #define __WARN_FLAGS(flags) \ do { \ instrumentation_begin(); \ diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index dd17c2da1af5..da78ccbd493b 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -58,4 +58,9 @@ static inline bool handle_guest_split_lock(unsigned long ip) return false; } #endif +#ifdef CONFIG_IA32_FEAT_CTL +void init_ia32_feat_ctl(struct cpuinfo_x86 *c); +#else +static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} +#endif #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 02dabc9e77b0..c693ebf32a15 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,6 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ +/* free ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -107,6 +108,7 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +/* free ( 3*32+29) */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h index 6722ffcef2e6..3afa990d756b 100644 --- a/arch/x86/include/asm/cpumask.h +++ b/arch/x86/include/asm/cpumask.h @@ -11,5 +11,23 @@ extern cpumask_var_t cpu_sibling_setup_mask; extern void setup_cpu_local_masks(void); +/* + * NMI and MCE exceptions need 
cpu_is_offline() _really_ early, + * provide an arch_ special for them to avoid instrumentation. + */ +#if NR_CPUS > 1 +static __always_inline bool arch_cpu_online(int cpu) +{ + return arch_test_bit(cpu, cpumask_bits(cpu_online_mask)); +} +#else +static __always_inline bool arch_cpu_online(int cpu) +{ + return cpu == 0; +} +#endif + +#define arch_cpu_is_offline(cpu) unlikely(!arch_cpu_online(cpu)) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_CPUMASK_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 9b8cb50768c2..b8f1dc0761e4 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #else # include <asm-generic/div64.h> -static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +/* + * Will generate an #DE when the result doesn't fit u64, could fix with an + * __ex_table[] entry when it becomes an issue. + */ +static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div) { u64 q; asm ("mulq %2; divq %3" : "=a" (q) - : "a" (a), "rm" ((u64)mul), "rm" ((u64)div) + : "a" (a), "rm" (mul), "rm" (div) : "rdx"); return q; } +#define mul_u64_u64_div_u64 mul_u64_u64_div_u64 + +static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +{ + return mul_u64_u64_div_u64(a, mul, div); +} #define mul_u64_u32_div mul_u64_u32_div #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index bca4c743de77..aefd53767a5d 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,36 +19,63 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Helper functions for reading/writing FS/GS base */ +/* Must be protected by X86_FEATURE_FSGSBASE check. 
*/ -static inline unsigned long x86_fsbase_read_cpu(void) +static __always_inline unsigned long rdfsbase(void) { unsigned long fsbase; - rdmsrl(MSR_FS_BASE, fsbase); + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); return fsbase; } -static inline unsigned long x86_gsbase_read_cpu_inactive(void) +static __always_inline unsigned long rdgsbase(void) { unsigned long gsbase; - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); return gsbase; } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static __always_inline void wrfsbase(unsigned long fsbase) { - wrmsrl(MSR_FS_BASE, fsbase); + asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); } -static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +static __always_inline void wrgsbase(unsigned long gsbase) { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } +#include <asm/cpufeature.h> + +/* Helper functions for reading/writing FS/GS base */ + +static inline unsigned long x86_fsbase_read_cpu(void) +{ + unsigned long fsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + fsbase = rdfsbase(); + else + rdmsrl(MSR_FS_BASE, fsbase); + + return fsbase; +} + +static inline void x86_fsbase_write_cpu(unsigned long fsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + wrfsbase(fsbase); + else + wrmsrl(MSR_FS_BASE, fsbase); +} + +extern unsigned long x86_gsbase_read_cpu_inactive(void); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); + #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796da07f8..d063841a17e3 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -306,6 +306,21 @@ .endif MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 .endm + +.macro RDPID opd + REG_TYPE rdpid_opd_type \opd + .if rdpid_opd_type == REG_TYPE_R64 + R64_NUM rdpid_opd \opd + .else + R32_NUM rdpid_opd \opd + .endif + .byte 0xf3 + .if rdpid_opd > 7 + PFX_REX rdpid_opd 0 + .endif + .byte 0x0f, 0xc7 + MODRM 0xc0 rdpid_opd 0x7 +.endm #endif #endif diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 073eb7ad2f56..143bc9abe99c 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -66,6 +66,8 @@ struct arch_specific_insn { */ bool boostable; bool if_modifier; + /* Number of bytes of text poked */ + int tp_len; }; struct arch_optimized_insn { diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 848ce43b9040..5049f6c22683 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -43,9 +43,10 @@ void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); +void __init mem_encrypt_free_decrypted_mem(void); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); -void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); @@ -77,6 +78,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline void mem_encrypt_free_decrypted_mem(void) { } + #define __bss_decrypted #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/asm/msr-index.h 
b/arch/x86/include/asm/msr-index.h index e8370e64a155..63ed8fe35738 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -418,7 +418,6 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f -#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_AMD_PERF_STATUS 0xc0010063 @@ -427,6 +426,7 @@ #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD_PPIN_CTL 0xc00102f0 #define MSR_AMD_PPIN 0xc00102f1 +#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a @@ -466,6 +466,8 @@ #define MSR_F16H_DR0_ADDR_MASK 0xc0011027 /* Fam 15h MSRs */ +#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a +#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b #define MSR_F15H_PERF_CTL 0xc0010200 #define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL #define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 76aa21e8128d..b836138ce852 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -999,15 +999,12 @@ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); extern void memblock_find_dma_reserve(void); - - -#ifdef CONFIG_X86_64 -extern pgd_t trampoline_pgd_entry; - void __init poking_init(void); - unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); + +#ifdef CONFIG_X86_64 +extern pgd_t trampoline_pgd_entry; #endif /* local pte updates need not use xchg for locking */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 42cd333616c4..7c2ecdf7e064 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -457,10 +457,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); -#if IS_ENABLED(CONFIG_KVM) /* Save actual FS/GS selectors and bases to current->thread */ -void save_fsgs_for_kvm(void); -#endif +void current_save_fsgs(void); #else /* X86_64 */ #ifdef CONFIG_STACKPROTECTOR /* @@ -575,7 +573,7 @@ native_load_sp0(unsigned long sp0) this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } -static inline void native_swapgs(void) +static __always_inline void native_swapgs(void) { #ifdef CONFIG_X86_64 asm volatile("swapgs" ::: "memory"); diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 9804a7957f4e..7fb482f0f25b 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -90,6 +90,15 @@ static __always_inline void boot_init_stack_canary(void) #endif } +static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) +{ +#ifdef CONFIG_X86_64 + per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; +#else + per_cpu(stack_canary.canary, cpu) = idle->stack_canary; +#endif +} + static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -119,6 +128,9 @@ static inline void load_stack_canary_segment(void) static inline void setup_stack_canary_segment(int cpu) { } +static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) +{ } + static inline void load_stack_canary_segment(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 79d8d5496330..f4234575f3fd 
100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -193,7 +193,7 @@ static inline void sched_clear_itmt_support(void) } #endif /* CONFIG_SCHED_MC_PRIO */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_X86_64) #include <asm/cpufeature.h> DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 8b2effe6efb8..5fdfcb47000f 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,4 +5,7 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) +/* Kernel allows FSGSBASE instructions available in Ring 3 */ +#define HWCAP2_FSGSBASE BIT(1) + #endif diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8fd39ff74a49..20e07feb4064 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> @@ -53,7 +54,7 @@ __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(buf, len, fmt, args...) \ @@ -64,7 +65,7 @@ do { \ if (!(len)) \ break; \ \ - printk(KERN_DEBUG fmt, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ @@ -1001,6 +1002,7 @@ struct text_poke_loc { s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + u8 old; }; struct bp_patching_desc { @@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } text_poke_sync(); @@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; int len = text_opcode_size(tp[i].opcode); if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, (const char *)tp[i].text + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. 
Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); } if (do_sync) { diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0b71970d2d3d..f0b743a2fe9c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void) * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * - * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the - * FSGSBASE enablement patches have been merged. ] - * * If FSGSBASE is disabled, the user can only put a user space * address in GS. That makes an attack harder, but still * possible if there's no SMAP protection. */ - if (!smap_works_speculatively()) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation @@ -763,10 +761,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not * required. */ - if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* @@ -778,12 +778,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* - * If STIBP is not available, clear the STIBP mode. - */ - if (!boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; - spectre_v2_user_stibp = mode; set_mode: @@ -1270,7 +1264,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always disabled in strict * mode. It can neither be enabled if it was force-disabled * by a previous prctl call. - */ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 426792565d86..c5cf336e5077 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -3,6 +3,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/e820/api.h> #include <asm/mtrr.h> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 043d93cdcaad..965474d78cef 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -347,6 +347,9 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +/* These bits should not change their value after CPU init is finished. 
*/ +static const unsigned long cr4_pinned_mask = + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE; static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); static unsigned long cr4_pinned_bits __ro_after_init; @@ -371,20 +374,20 @@ EXPORT_SYMBOL(native_write_cr0); void native_write_cr4(unsigned long val) { - unsigned long bits_missing = 0; + unsigned long bits_changed = 0; set_register: asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); if (static_branch_likely(&cr_pinning)) { - if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { - bits_missing = ~val & cr4_pinned_bits; - val |= bits_missing; + if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) { + bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits; + val = (val & ~cr4_pinned_mask) | cr4_pinned_bits; goto set_register; } - /* Warn after we've set the missing bits. */ - WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", - bits_missing); + /* Warn after we've corrected the changed bits. */ + WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n", + bits_changed); } } #if IS_MODULE(CONFIG_LKDTM) @@ -419,7 +422,7 @@ void cr4_init(void) if (boot_cpu_has(X86_FEATURE_PCID)) cr4 |= X86_CR4_PCIDE; if (static_branch_likely(&cr_pinning)) - cr4 |= cr4_pinned_bits; + cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits; __write_cr4(cr4); @@ -434,13 +437,26 @@ void cr4_init(void) */ static void __init setup_cr_pinning(void) { - unsigned long mask; - - mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); - cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask; static_key_enable(&cr_pinning.key); } +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. */ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1495,6 +1511,12 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." 
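Taken together, the hwcap2.h and common.c hunks expose FSGSBASE to ring 3: the kernel sets CR4.FSGSBASE and advertises HWCAP2_FSGSBASE in AT_HWCAP2. Below is a minimal user-space sketch of how a program might probe that bit before using RDFSBASE, assuming a Linux glibc toolchain and the -mfsgsbase compiler flag; the fallback #define, file name and printed strings are illustrative and not part of this merge.

    /* fsgsbase_probe.c: check AT_HWCAP2 before using the FSGSBASE intrinsics. */
    #include <stdio.h>
    #include <sys/auxv.h>      /* getauxval(), AT_HWCAP2 */
    #include <immintrin.h>     /* _readfsbase_u64(), requires -mfsgsbase */

    #ifndef HWCAP2_FSGSBASE
    #define HWCAP2_FSGSBASE (1 << 1)   /* mirrors the uapi hwcap2.h hunk above */
    #endif

    int main(void)
    {
            if (!(getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE)) {
                    puts("FSGSBASE not enabled for user space (old kernel or nofsgsbase)");
                    return 1;
            }

            /* RDFSBASE reads the FS base directly, without an arch_prctl() round trip. */
            printf("fs base = %#llx\n", (unsigned long long)_readfsbase_u64());
            return 0;
    }

On kernels without this series, or when booted with nofsgsbase, the HWCAP2 bit stays clear and executing RDFSBASE raises SIGILL, so the auxv check has to come first.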
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index fb538fccd24c..9d033693519a 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -81,8 +81,4 @@ extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); -#ifdef CONFIG_IA32_FEAT_CTL -void init_ia32_feat_ctl(struct cpuinfo_x86 *c); -#endif - #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index ce9120c4f740..fbe89a92ff36 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1083,7 +1083,7 @@ static noinstr bool mce_check_crashing_cpu(void) { unsigned int cpu = smp_processor_id(); - if (cpu_is_offline(cpu) || + if (arch_cpu_is_offline(cpu) || (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 43c466020ed5..03e51053592a 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -345,7 +345,7 @@ static __init int dev_mcelog_init_device(void) int err; mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); - mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL); + mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL); if (!mcelog) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 0593b192eb8f..7843ab3fde09 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -511,7 +511,7 @@ static void do_inject(void) */ if (inj_type == DFR_INT_INJ) { i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + i_mce.status &= ~MCI_STATUS_UC; } /* diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index baec68b7e010..ec6f0415bc6d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -145,7 +145,6 @@ extern struct builtin_fw __end_builtin_fw[]; bool get_builtin_firmware(struct cpio_data *cd, const char *name) { -#ifdef CONFIG_FW_LOADER struct builtin_fw *b_fw; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { @@ -155,7 +154,6 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) return true; } } -#endif return false; } diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 12f967c6b603..6a9df71c1b9e 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -981,10 +981,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; - if (c->x86_vendor == X86_VENDOR_INTEL) - c->x86_cache_mbm_width_offset = eax & 0xff; - else - c->x86_cache_mbm_width_offset = -1; + c->x86_cache_mbm_width_offset = eax & 0xff; + + if (c->x86_vendor == X86_VENDOR_AMD && !c->x86_cache_mbm_width_offset) + c->x86_cache_mbm_width_offset = MBM_CNTR_WIDTH_OFFSET_AMD; } } diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index f20a47d120b1..5ffa32256b3b 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,7 @@ #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX #define MAX_MBA_BW_AMD 0x800 +#define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 
23b4b61319d3..3f844f14fc0a 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1117,6 +1117,7 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d, _d_cdp = rdt_find_domain(_r_cdp, d->id, NULL); if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) { _r_cdp = NULL; + _d_cdp = NULL; ret = -EINVAL; } diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index df1358ba622b..05fa4ef63490 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -2,6 +2,7 @@ #include <linux/sched.h> #include <linux/sched/clock.h> +#include <asm/cpu.h> #include <asm/cpufeature.h> #include "cpu.h" diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 0db21206f2f3..7ecf9babf0cb 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -160,7 +160,7 @@ static const __initconst struct idt_data apic_idts[] = { /* Must be page-aligned because the real IDT is used in the cpu entry area */ static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; -struct desc_ptr idt_descr __ro_after_init = { +static struct desc_ptr idt_descr __ro_after_init = { .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ada39ddbc922..fdadc37d72af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -33,6 +33,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/sched/debug.h> +#include <linux/perf_event.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> @@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p) /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + /* OK, write back the instruction(s) into ROX insn buffer */ text_poke(p->ainsn.insn, buf, len); @@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } void arch_disarm_kprobe(struct kprobe *p) { + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); text_poke_sync(); } @@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7af4c61dde52..40f380461e6d 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -6,6 +6,7 @@ * Copyright (C) Hitachi Ltd., 2012 */ #include <linux/kprobes.h> +#include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/string.h> #include <linux/slab.h> @@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op, static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before 
freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); op->optinsn.insn = NULL; op->optinsn.size = 0; } @@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += JMP32_INSN_SIZE; + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). + */ + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); text_poke(slot, buf, len); + ret = 0; out: kfree(buf); @@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist) */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - arch_arm_kprobe(&op->kp); - text_poke(op->kp.addr + INT3_INSN_SIZE, - op->optinsn.copied_insn, DISP32_SIZE); + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1547be359d7f..576c43e39247 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -42,6 +42,14 @@ static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; +enum allow_write_msrs { + MSR_WRITES_ON, + MSR_WRITES_OFF, + MSR_WRITES_DEFAULT, +}; + +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; + static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -70,6 +78,24 @@ static ssize_t msr_read(struct file *file, char __user *buf, return bytes ? 
bytes : err; } +static int filter_write(u32 reg) +{ + switch (allow_writes) { + case MSR_WRITES_ON: return 0; break; + case MSR_WRITES_OFF: return -EPERM; break; + default: break; + } + + if (reg == MSR_IA32_ENERGY_PERF_BIAS) + return 0; + + pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n" + "Please report to x86@kernel.org\n", + reg, current->comm); + + return 0; +} + static ssize_t msr_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -84,6 +110,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (err) return err; + err = filter_write(reg); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -92,9 +122,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf, err = -EFAULT; break; } + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); if (err) break; + tmp += 2; bytes += 8; } @@ -242,6 +276,41 @@ static void __exit msr_exit(void) } module_exit(msr_exit) +static int set_allow_writes(const char *val, const struct kernel_param *cp) +{ + /* val is NUL-terminated, see kernfs_fop_write() */ + char *s = strstrip((char *)val); + + if (!strcmp(s, "on")) + allow_writes = MSR_WRITES_ON; + else if (!strcmp(s, "off")) + allow_writes = MSR_WRITES_OFF; + else + allow_writes = MSR_WRITES_DEFAULT; + + return 0; +} + +static int get_allow_writes(char *buf, const struct kernel_param *kp) +{ + const char *res; + + switch (allow_writes) { + case MSR_WRITES_ON: res = "on"; break; + case MSR_WRITES_OFF: res = "off"; break; + default: res = "default"; break; + } + + return sprintf(buf, "%s\n", res); +} + +static const struct kernel_param_ops allow_writes_ops = { + .set = set_allow_writes, + .get = get_allow_writes +}; + +module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600); + MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); MODULE_DESCRIPTION("x86 generic MSR driver"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 2de365f15684..d7c5e44b26f7 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -478,7 +478,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id())) + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f362ce0d5ac0..216c88df1919 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 
0 : current->thread.fsbase; + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); #else diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415b2139..d618969cea66 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -150,6 +150,44 @@ enum which_selector { }; /* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); +} + +/* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function @@ -198,22 +236,35 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } -#if IS_ENABLED(CONFIG_KVM) /* * While a process is running,current->thread.fsbase and current->thread.gsbase - * may not match the corresponding CPU registers (see save_base_legacy()). KVM - * wants an efficient way to save and restore FSBASE and GSBASE. - * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. + * may not match the corresponding CPU registers (see save_base_legacy()). 
*/ -void save_fsgs_for_kvm(void) +void current_save_fsgs(void) { + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); save_fsgs(current); + local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, @@ -278,10 +329,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. */ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } static unsigned long x86_fsgsbase_read_task(struct task_struct *task, @@ -327,13 +390,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (task->thread.fsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -347,7 +441,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (task->thread.gsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 44130588987f..1c7646c8d0fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -379,25 +379,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - * - * NB: This behavior is nonsensical and likely needs to - * change when FSGSBASE support is added. 
- */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. - */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ffbd9a3d78d8..27aa04a95702 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -51,11 +51,11 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> -#include <linux/stackprotector.h> #include <linux/gfp.h> #include <linux/cpuidle.h> #include <linux/numa.h> #include <linux/pgtable.h> +#include <linux/overflow.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -80,6 +80,7 @@ #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> +#include <asm/stackprotector.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -259,21 +260,10 @@ static void notrace start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - /* to prevent fake stack check failure in clock setup */ - boot_init_stack_canary(); - x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - - /* - * Prevent tail call to cpu_startup_entry() because the stack protector - * guard has been changed a couple of function calls up, in - * boot_init_stack_canary() and must not be checked before tail calling - * another function. - */ - prevent_tail_call_optimization(); } /** @@ -1011,6 +1001,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ ret = irq_init_percpu_irqstack(cpu); @@ -1777,6 +1768,7 @@ void native_play_dead(void) #endif +#ifdef CONFIG_X86_64 /* * APERF/MPERF frequency ratio computation. * @@ -1975,6 +1967,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) static bool intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; + u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; @@ -2000,15 +1993,23 @@ out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. 
*/ - if (!base_freq) { - pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n"); + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } - arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, - base_freq); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); + return true; } @@ -2048,11 +2049,19 @@ static void init_freq_invariance(bool secondary) } } +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); +} + +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void arch_scale_freq_tick(void) { - u64 freq_scale; + u64 freq_scale = SCHED_CAPACITY_SCALE; u64 aperf, mperf; u64 acnt, mcnt; @@ -2064,19 +2073,32 @@ void arch_scale_freq_tick(void) acnt = aperf - this_cpu_read(arch_prev_aperf); mcnt = mperf - this_cpu_read(arch_prev_mperf); - if (!mcnt) - return; this_cpu_write(arch_prev_aperf, aperf); this_cpu_write(arch_prev_mperf, mperf); - acnt <<= 2*SCHED_CAPACITY_SHIFT; - mcnt *= arch_max_freq_ratio; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); +} +#else +static inline void init_freq_invariance(bool secondary) +{ } +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cb22f33bf1d8..a58453f7058e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1172,7 +1172,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { - save_fsgs_for_kvm(); + current_save_fsgs(); fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index fff28c6f73a2..b0dfac3d3df7 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -24,6 +24,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" + " .align 16\n" "0: movq $0,(%[dst])\n" " addq $8,%[dst]\n" " decl %%ecx ; jnz 0b\n" diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 001dd7dc829f..c7a47603537f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,7 @@ #include <asm/cpufeature.h> #include <asm/pti.h> #include <asm/text-patching.h> +#include <asm/memtype.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -912,8 +913,6 @@ void free_kernel_image_pages(const char *what, void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } -void __weak mem_encrypt_free_decrypted_mem(void) { } - void __ref free_initmem(void) { e820__reallocate_tables(); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c 
index 4a781cf99e92..9f1177edc2e7 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -376,7 +376,6 @@ bool force_dma_unencrypted(struct device *dev) return false; } -/* Architecture __weak replacement functions */ void __init mem_encrypt_free_decrypted_mem(void) { unsigned long vaddr, vaddr_end, npages; @@ -401,6 +400,7 @@ void __init mem_encrypt_free_decrypted_mem(void) free_init_pages("unused decrypted", vaddr, vaddr_end); } +/* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { if (!sme_me_mask) diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 7c65102debaf..db1378c6ff26 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -193,6 +193,8 @@ static void fix_processor_context(void) */ static void notrace __restore_processor_state(struct saved_context *ctxt) { + struct cpuinfo_x86 *c; + if (ctxt->misc_enable_saved) wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* @@ -263,6 +265,10 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) mtrr_bp_restore(); perf_restore_debug_store(); msr_restore_context(ctxt); + + c = &cpu_data(smp_processor_id()); + if (cpu_has(c, X86_FEATURE_MSR_IA32_FEAT_CTL)) + init_ia32_feat_ctl(c); } /* Needed by apm.c */ diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 171aff1b11f2..9ea598dcc132 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -92,9 +92,7 @@ static void cpu_bringup(void) asmlinkage __visible void cpu_bringup_and_idle(void) { cpu_bringup(); - boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - prevent_tail_call_optimization(); } void xen_smp_intr_free_pv(unsigned int cpu) diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index 3655d9d3f5df..198a8eb1cd56 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -482,7 +482,6 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev, int (*cbk_do_batch)(struct crypto_engine *engine), bool rt, int qlen) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 }; struct crypto_engine *engine; if (!dev) @@ -520,7 +519,7 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev, if (engine->rt) { dev_info(dev, "will run requests pump with realtime priority\n"); - sched_setscheduler(engine->kworker->task, SCHED_FIFO, ¶m); + sched_set_fifo(engine->kworker->task); } return engine; diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index e7dc0133f817..0ad9f2059e6d 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -136,12 +136,11 @@ static unsigned int idle_pct = 5; /* percentage */ static unsigned int round_robin_time = 1; /* second */ static int power_saving_thread(void *data) { - struct sched_param param = {.sched_priority = 1}; int do_sleep; unsigned int tsk_index = (unsigned long)data; u64 last_jiffies = 0; - sched_setscheduler(current, SCHED_RR, ¶m); + sched_set_fifo_low(current); while (!kthread_should_stop()) { unsigned long expire_time; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3a3f2b6a821f..280615efef74 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -6019,11 +6019,8 @@ int drbd_ack_receiver(struct drbd_thread *thi) unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct sched_param param = { .sched_priority = 2 }; - rv = sched_setscheduler(current, SCHED_RR, ¶m); - if (rv < 0) - 
drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); + sched_set_fifo_low(current); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index e6fc022bc87e..3939699e62fe 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -278,3 +278,14 @@ config EFI_EARLYCON depends on SERIAL_EARLYCON && !ARM && !IA64 select FONT_SUPPORT select ARCH_USE_MEMREMAP_PROT + +config EFI_CUSTOM_SSDT_OVERLAYS + bool "Load custom ACPI SSDT overlay from an EFI variable" + depends on EFI_VARS && ACPI + default ACPI_TABLE_UPGRADE + help + Allow loading of an ACPI SSDT overlay from an EFI variable specified + by a kernel command line option. + + See Documentation/admin-guide/acpi/ssdt-overlays.rst for more + information. diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index c697e70ca7e7..71c445d20258 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -52,9 +52,11 @@ static phys_addr_t __init efi_to_phys(unsigned long addr) } static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; +static __initdata unsigned long cpu_state_table = EFI_INVALID_TABLE_ADDR; static const efi_config_table_type_t arch_tables[] __initconst = { {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, &screen_info_table}, + {LINUX_EFI_ARM_CPU_STATE_TABLE_GUID, &cpu_state_table}, {} }; @@ -62,7 +64,8 @@ static void __init init_screen_info(void) { struct screen_info *si; - if (screen_info_table != EFI_INVALID_TABLE_ADDR) { + if (IS_ENABLED(CONFIG_ARM) && + screen_info_table != EFI_INVALID_TABLE_ADDR) { si = early_memremap_ro(screen_info_table, sizeof(*si)); if (!si) { pr_err("Could not map screen_info config table\n"); @@ -116,7 +119,8 @@ static int __init uefi_init(u64 efi_system_table) goto out; } retval = efi_config_parse_tables(config_tables, systab->nr_tables, - arch_tables); + IS_ENABLED(CONFIG_ARM) ? 
arch_tables + : NULL); early_memunmap(config_tables, table_size); out: @@ -238,9 +242,37 @@ void __init efi_init(void) init_screen_info(); +#ifdef CONFIG_ARM /* ARM does not permit early mappings to persist across paging_init() */ - if (IS_ENABLED(CONFIG_ARM)) - efi_memmap_unmap(); + efi_memmap_unmap(); + + if (cpu_state_table != EFI_INVALID_TABLE_ADDR) { + struct efi_arm_entry_state *state; + bool dump_state = true; + + state = early_memremap_ro(cpu_state_table, + sizeof(struct efi_arm_entry_state)); + if (state == NULL) { + pr_warn("Unable to map CPU entry state table.\n"); + return; + } + + if ((state->sctlr_before_ebs & 1) == 0) + pr_warn(FW_BUG "EFI stub was entered with MMU and Dcache disabled, please fix your firmware!\n"); + else if ((state->sctlr_after_ebs & 1) == 0) + pr_warn(FW_BUG "ExitBootServices() returned with MMU and Dcache disabled, please fix your firmware!\n"); + else + dump_state = false; + + if (dump_state || efi_enabled(EFI_DBG)) { + pr_info("CPSR at EFI stub entry : 0x%08x\n", state->cpsr_before_ebs); + pr_info("SCTLR at EFI stub entry : 0x%08x\n", state->sctlr_before_ebs); + pr_info("CPSR after ExitBootServices() : 0x%08x\n", state->cpsr_after_ebs); + pr_info("SCTLR after ExitBootServices(): 0x%08x\n", state->sctlr_after_ebs); + } + early_memunmap(state, sizeof(struct efi_arm_entry_state)); + } +#endif } static bool efifb_overlaps_pci_range(const struct of_pci_range *range) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 7f1657b6c30d..5114cae4ec97 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -189,7 +189,7 @@ static void generic_ops_unregister(void) efivars_unregister(&generic_efivars); } -#if IS_ENABLED(CONFIG_ACPI) +#ifdef CONFIG_EFI_CUSTOM_SSDT_OVERLAYS #define EFIVAR_SSDT_NAME_MAX 16 static char efivar_ssdt[EFIVAR_SSDT_NAME_MAX] __initdata; static int __init efivar_ssdt_setup(char *str) @@ -622,7 +622,8 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables, rsv = (void *)(p + prsv % PAGE_SIZE); /* reserve the entry itself */ - memblock_reserve(prsv, EFI_MEMRESERVE_SIZE(rsv->size)); + memblock_reserve(prsv, + struct_size(rsv, entry, rsv->size)); for (i = 0; i < atomic_read(&rsv->count); i++) { memblock_reserve(rsv->entry[i].base, diff --git a/drivers/firmware/efi/esrt.c b/drivers/firmware/efi/esrt.c index e3d692696583..d5915272141f 100644 --- a/drivers/firmware/efi/esrt.c +++ b/drivers/firmware/efi/esrt.c @@ -181,7 +181,7 @@ static int esre_create_sysfs_entry(void *esre, int entry_num) rc = kobject_init_and_add(&entry->kobj, &esre1_ktype, NULL, "entry%d", entry_num); if (rc) { - kfree(entry); + kobject_put(&entry->kobj); return rc; } } diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 75daaf20374e..4cce372edaf4 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -6,7 +6,8 @@ # enabled, even if doing so doesn't break the build. 
# cflags-$(CONFIG_X86_32) := -march=i386 -cflags-$(CONFIG_X86_64) := -mcmodel=small +cflags-$(CONFIG_X86_64) := -mcmodel=small \ + $(call cc-option,-maccumulate-outgoing-args) cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ \ -fPIC -fno-strict-aliasing -mno-red-zone \ -mno-mmx -mno-sse -fshort-wchar \ diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 40243f524556..d08e5d55838c 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -7,10 +7,49 @@ #include "efistub.h" +static efi_guid_t cpu_state_guid = LINUX_EFI_ARM_CPU_STATE_TABLE_GUID; + +struct efi_arm_entry_state *efi_entry_state; + +static void get_cpu_state(u32 *cpsr, u32 *sctlr) +{ + asm("mrs %0, cpsr" : "=r"(*cpsr)); + if ((*cpsr & MODE_MASK) == HYP_MODE) + asm("mrc p15, 4, %0, c1, c0, 0" : "=r"(*sctlr)); + else + asm("mrc p15, 0, %0, c1, c0, 0" : "=r"(*sctlr)); +} + efi_status_t check_platform_features(void) { + efi_status_t status; + u32 cpsr, sctlr; int block; + get_cpu_state(&cpsr, &sctlr); + + efi_info("Entering in %s mode with MMU %sabled\n", + ((cpsr & MODE_MASK) == HYP_MODE) ? "HYP" : "SVC", + (sctlr & 1) ? "en" : "dis"); + + status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, + sizeof(*efi_entry_state), + (void **)&efi_entry_state); + if (status != EFI_SUCCESS) { + efi_err("allocate_pool() failed\n"); + return status; + } + + efi_entry_state->cpsr_before_ebs = cpsr; + efi_entry_state->sctlr_before_ebs = sctlr; + + status = efi_bs_call(install_configuration_table, &cpu_state_guid, + efi_entry_state); + if (status != EFI_SUCCESS) { + efi_err("install_configuration_table() failed\n"); + goto free_state; + } + /* non-LPAE kernels can run anywhere */ if (!IS_ENABLED(CONFIG_ARM_LPAE)) return EFI_SUCCESS; @@ -19,9 +58,22 @@ efi_status_t check_platform_features(void) block = cpuid_feature_extract(CPUID_EXT_MMFR0, 0); if (block < 5) { efi_err("This LPAE kernel is not supported by your CPU\n"); - return EFI_UNSUPPORTED; + status = EFI_UNSUPPORTED; + goto drop_table; } return EFI_SUCCESS; + +drop_table: + efi_bs_call(install_configuration_table, &cpu_state_guid, NULL); +free_state: + efi_bs_call(free_pool, efi_entry_state); + return status; +} + +void efi_handle_post_ebs_state(void) +{ + get_cpu_state(&efi_entry_state->cpsr_after_ebs, + &efi_entry_state->sctlr_after_ebs); } static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 89f075275300..d40fd68c6bb2 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -32,6 +32,10 @@ bool __pure __efi_soft_reserve_enabled(void) return !efi_nosoftreserve; } +/** + * efi_char16_puts() - Write a UCS-2 encoded string to the console + * @str: UCS-2 encoded string + */ void efi_char16_puts(efi_char16_t *str) { efi_call_proto(efi_table_attr(efi_system_table, con_out), @@ -83,6 +87,10 @@ u32 utf8_to_utf32(const u8 **s8) return c32; } +/** + * efi_puts() - Write a UTF-8 encoded string to the console + * @str: UTF-8 encoded string + */ void efi_puts(const char *str) { efi_char16_t buf[128]; @@ -113,6 +121,16 @@ void efi_puts(const char *str) } } +/** + * efi_printk() - Print a kernel message + * @fmt: format string + * + * The first letter of the format string is used to determine the logging level + * of the message. If the level is less then the current EFI logging level, the + * message is suppressed. 
The message will be truncated to 255 bytes. + * + * Return: number of printed characters + */ int efi_printk(const char *fmt, ...) { char printf_buf[256]; @@ -154,13 +172,18 @@ int efi_printk(const char *fmt, ...) return printed; } -/* - * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi= +/** + * efi_parse_options() - Parse EFI command line options + * @cmdline: kernel command line + * + * Parse the ASCII string @cmdline for EFI options, denoted by the efi= * option, e.g. efi=nochunk. * * It should be noted that efi= is parsed in two very different * environments, first in the early boot environment of the EFI boot * stub, and subsequently during the kernel boot. + * + * Return: status code */ efi_status_t efi_parse_options(char const *cmdline) { @@ -286,13 +309,21 @@ char *efi_convert_cmdline(efi_loaded_image_t *image, int *cmd_line_len) return (char *)cmdline_addr; } -/* +/** + * efi_exit_boot_services() - Exit boot services + * @handle: handle of the exiting image + * @map: pointer to receive the memory map + * @priv: argument to be passed to @priv_func + * @priv_func: function to process the memory map before exiting boot services + * * Handle calling ExitBootServices according to the requirements set out by the * spec. Obtains the current memory map, and returns that info after calling * ExitBootServices. The client must specify a function to perform any * processing of the memory map data prior to ExitBootServices. A client * specific structure may be passed to the function via priv. The client * function may be called multiple times. + * + * Return: status code */ efi_status_t efi_exit_boot_services(void *handle, struct efi_boot_memmap *map, @@ -361,6 +392,11 @@ fail: return status; } +/** + * get_efi_config_table() - retrieve UEFI configuration table + * @guid: GUID of the configuration table to be retrieved + * Return: pointer to the configuration table or NULL + */ void *get_efi_config_table(efi_guid_t guid) { unsigned long tables = efi_table_attr(efi_system_table, tables); @@ -408,17 +444,18 @@ static const struct { }; /** - * efi_load_initrd_dev_path - load the initrd from the Linux initrd device path + * efi_load_initrd_dev_path() - load the initrd from the Linux initrd device path * @load_addr: pointer to store the address where the initrd was loaded * @load_size: pointer to store the size of the loaded initrd * @max: upper limit for the initrd memory allocation - * @return: %EFI_SUCCESS if the initrd was loaded successfully, in which - * case @load_addr and @load_size are assigned accordingly - * %EFI_NOT_FOUND if no LoadFile2 protocol exists on the initrd - * device path - * %EFI_INVALID_PARAMETER if load_addr == NULL or load_size == NULL - * %EFI_OUT_OF_RESOURCES if memory allocation failed - * %EFI_LOAD_ERROR in all other cases + * + * Return: + * * %EFI_SUCCESS if the initrd was loaded successfully, in which + * case @load_addr and @load_size are assigned accordingly + * * %EFI_NOT_FOUND if no LoadFile2 protocol exists on the initrd device path + * * %EFI_INVALID_PARAMETER if load_addr == NULL or load_size == NULL + * * %EFI_OUT_OF_RESOURCES if memory allocation failed + * * %EFI_LOAD_ERROR in all other cases */ static efi_status_t efi_load_initrd_dev_path(unsigned long *load_addr, @@ -481,6 +518,16 @@ efi_status_t efi_load_initrd_cmdline(efi_loaded_image_t *image, load_addr, load_size); } +/** + * efi_load_initrd() - Load initial RAM disk + * @image: EFI loaded image protocol + * @load_addr: pointer to loaded initrd + * @load_size: size of loaded 
initrd + * @soft_limit: preferred size of allocated memory for loading the initrd + * @hard_limit: minimum size of allocated memory + * + * Return: status code + */ efi_status_t efi_load_initrd(efi_loaded_image_t *image, unsigned long *load_addr, unsigned long *load_size, @@ -505,6 +552,15 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image, return status; } +/** + * efi_wait_for_key() - Wait for key stroke + * @usec: number of microseconds to wait for key stroke + * @key: key entered + * + * Wait for up to @usec microseconds for a key stroke. + * + * Return: status code, EFI_SUCCESS if key received + */ efi_status_t efi_wait_for_key(unsigned long usec, efi_input_key_t *key) { efi_event_t events[2], timer; diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index e97370bdfdb0..3318ec3f8e5b 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -329,6 +329,9 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, if (status != EFI_SUCCESS) goto fail_free_initrd; + if (IS_ENABLED(CONFIG_ARM)) + efi_handle_post_ebs_state(); + efi_enter_kernel(image_addr, fdt_addr, fdt_totalsize((void *)fdt_addr)); /* not reached */ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index bcd8c0a785f0..2c9d42264c29 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -157,8 +157,14 @@ typedef void (__efiapi *efi_event_notify_t)(efi_event_t, void *); #define EFI_EVT_NOTIFY_WAIT 0x00000100U #define EFI_EVT_NOTIFY_SIGNAL 0x00000200U -/* - * boottime->wait_for_event takes an array of events as input. +/** + * efi_set_event_at() - add event to events array + * + * @events: array of UEFI events + * @ids: index where to put the event in the array + * @event: event to add to the aray + * + * boottime->wait_for_event() takes an array of events as input. * Provide a helper to set it up correctly for mixed mode. 
*/ static inline @@ -771,4 +777,6 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image, unsigned long soft_limit, unsigned long hard_limit); +void efi_handle_post_ebs_state(void); + #endif diff --git a/drivers/firmware/efi/libstub/file.c b/drivers/firmware/efi/libstub/file.c index 2005e33b33d5..630caa6b1f4c 100644 --- a/drivers/firmware/efi/libstub/file.c +++ b/drivers/firmware/efi/libstub/file.c @@ -102,12 +102,20 @@ static int find_file_option(const efi_char16_t *cmdline, int cmdline_len, if (!found) return 0; + /* Skip any leading slashes */ + while (cmdline[i] == L'/' || cmdline[i] == L'\\') + i++; + while (--result_len > 0 && i < cmdline_len) { - if (cmdline[i] == L'\0' || - cmdline[i] == L'\n' || - cmdline[i] == L' ') + efi_char16_t c = cmdline[i++]; + + if (c == L'\0' || c == L'\n' || c == L' ') break; - *result++ = cmdline[i++]; + else if (c == L'/') + /* Replace UNIX dir separators with EFI standard ones */ + *result++ = L'\\'; + else + *result++ = c; } *result = L'\0'; return i; diff --git a/drivers/firmware/efi/libstub/skip_spaces.c b/drivers/firmware/efi/libstub/skip_spaces.c index a700b3c7f7d0..159fb4e456c6 100644 --- a/drivers/firmware/efi/libstub/skip_spaces.c +++ b/drivers/firmware/efi/libstub/skip_spaces.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/ctype.h> +#include <linux/string.h> #include <linux/types.h> char *skip_spaces(const char *str) diff --git a/drivers/firmware/psci/psci_checker.c b/drivers/firmware/psci/psci_checker.c index 873841af8d57..6fff482847e7 100644 --- a/drivers/firmware/psci/psci_checker.c +++ b/drivers/firmware/psci/psci_checker.c @@ -272,7 +272,6 @@ static int suspend_test_thread(void *arg) { int cpu = (long)arg; int i, nb_suspend = 0, nb_shallow_sleep = 0, nb_err = 0; - struct sched_param sched_priority = { .sched_priority = MAX_RT_PRIO-1 }; struct cpuidle_device *dev; struct cpuidle_driver *drv; /* No need for an actual callback, we just want to wake up the CPU. */ @@ -282,9 +281,7 @@ static int suspend_test_thread(void *arg) wait_for_completion(&suspend_threads_started); /* Set maximum priority to preempt all other threads on this CPU. */ - if (sched_setscheduler_nocheck(current, SCHED_FIFO, &sched_priority)) - pr_warn("Failed to set suspend thread scheduler on CPU %d\n", - cpu); + sched_set_fifo(current); dev = this_cpu_read(cpuidle_devices); drv = cpuidle_get_cpu_driver(dev); @@ -349,11 +346,6 @@ static int suspend_test_thread(void *arg) if (atomic_dec_return_relaxed(&nb_active_threads) == 0) complete(&suspend_threads_done); - /* Give up on RT scheduling and wait for termination. */ - sched_priority.sched_priority = 0; - if (sched_setscheduler_nocheck(current, SCHED_NORMAL, &sched_priority)) - pr_warn("Failed to set suspend thread scheduler on CPU %d\n", - cpu); for (;;) { /* Needs to be set first to avoid missing a wakeup. 
*/ set_current_state(TASK_INTERRUPTIBLE); diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index c981cc10aebf..6c57cc72d627 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -389,7 +389,6 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv) struct msm_kms *kms; struct msm_mdss *mdss; int ret, i; - struct sched_param param; ddev = drm_dev_alloc(drv, dev); if (IS_ERR(ddev)) { @@ -495,12 +494,6 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv) ddev->mode_config.funcs = &mode_config_funcs; ddev->mode_config.helper_private = &mode_config_helper_funcs; - /** - * this priority was found during empiric testing to have appropriate - * realtime scheduling to process display updates and interact with - * other real time and normal priority task - */ - param.sched_priority = 16; for (i = 0; i < priv->num_crtcs; i++) { /* initialize event thread */ priv->event_thread[i].crtc_id = priv->crtcs[i]->base.id; @@ -516,11 +509,7 @@ static int msm_drm_init(struct device *dev, struct drm_driver *drv) goto err_msm_uninit; } - ret = sched_setscheduler(priv->event_thread[i].thread, - SCHED_FIFO, ¶m); - if (ret) - dev_warn(dev, "event_thread set priority failed:%d\n", - ret); + sched_set_fifo(priv->event_thread[i].thread); } ret = drm_vblank_init(ddev, priv->num_crtcs); diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 2f319102ae9f..17cf77e2c4e5 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -760,11 +760,10 @@ static bool drm_sched_blocked(struct drm_gpu_scheduler *sched) */ static int drm_sched_main(void *param) { - struct sched_param sparam = {.sched_priority = 1}; struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param; int r; - sched_setscheduler(current, SCHED_FIFO, &sparam); + sched_set_fifo_low(current); while (!kthread_should_stop()) { struct drm_sched_entity *entity = NULL; diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c index 267eac00a3fb..29f5fed28c2a 100644 --- a/drivers/hwmon/fam15h_power.c +++ b/drivers/hwmon/fam15h_power.c @@ -41,10 +41,6 @@ MODULE_LICENSE("GPL"); /* set maximum interval as 1 second */ #define MAX_INTERVAL 1000 -#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a -#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b -#define MSR_F15H_PTSC 0xc0010280 - #define PCI_DEVICE_ID_AMD_15H_M70H_NB_F4 0x15b4 struct fam15h_power_data { diff --git a/drivers/media/pci/ivtv/ivtv-driver.c b/drivers/media/pci/ivtv/ivtv-driver.c index b1dde1be6f5e..28acb14490d5 100644 --- a/drivers/media/pci/ivtv/ivtv-driver.c +++ b/drivers/media/pci/ivtv/ivtv-driver.c @@ -737,8 +737,6 @@ done: */ static int ivtv_init_struct1(struct ivtv *itv) { - struct sched_param param = { .sched_priority = 99 }; - itv->base_addr = pci_resource_start(itv->pdev, 0); itv->enc_mbox.max_mbox = 2; /* the encoder has 3 mailboxes (0-2) */ itv->dec_mbox.max_mbox = 1; /* the decoder has 2 mailboxes (0-1) */ @@ -758,7 +756,7 @@ static int ivtv_init_struct1(struct ivtv *itv) return -1; } /* must use the FIFO scheduler as it is realtime sensitive */ - sched_setscheduler(itv->irq_worker_task, SCHED_FIFO, ¶m); + sched_set_fifo(itv->irq_worker_task); kthread_init_work(&itv->irq_work, ivtv_irq_work_handler); diff --git a/drivers/mmc/core/sdio_irq.c b/drivers/mmc/core/sdio_irq.c index 3ffe4ff49aa7..4b1f7c966ec8 100644 --- a/drivers/mmc/core/sdio_irq.c +++ b/drivers/mmc/core/sdio_irq.c @@ -139,11 +139,10 @@ 
EXPORT_SYMBOL_GPL(sdio_signal_irq); static int sdio_irq_thread(void *_host) { struct mmc_host *host = _host; - struct sched_param param = { .sched_priority = 1 }; unsigned long period, idle_period; int ret; - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_set_fifo_low(current); /* * We want to allow for SDIO cards to work even on non SDIO diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index debea5c4c829..d09260382550 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -709,9 +709,6 @@ static void cros_ec_spi_high_pri_release(void *worker) static int cros_ec_spi_devm_high_pri_alloc(struct device *dev, struct cros_ec_spi *ec_spi) { - struct sched_param sched_priority = { - .sched_priority = MAX_RT_PRIO / 2, - }; int err; ec_spi->high_pri_worker = @@ -728,11 +725,9 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev, if (err) return err; - err = sched_setscheduler_nocheck(ec_spi->high_pri_worker->task, - SCHED_FIFO, &sched_priority); - if (err) - dev_err(dev, "Can't set cros_ec high pri priority: %d\n", err); - return err; + sched_set_fifo(ec_spi->high_pri_worker->task); + + return 0; } static int cros_ec_spi_probe(struct spi_device *spi) diff --git a/drivers/powercap/idle_inject.c b/drivers/powercap/idle_inject.c index c90f0990968b..bc396a8231bf 100644 --- a/drivers/powercap/idle_inject.c +++ b/drivers/powercap/idle_inject.c @@ -268,9 +268,7 @@ void idle_inject_stop(struct idle_inject_device *ii_dev) */ static void idle_inject_setup(unsigned int cpu) { - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); + sched_set_fifo(current); } /** diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8158e281f354..5a4f0bfce474 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1592,11 +1592,9 @@ EXPORT_SYMBOL_GPL(spi_take_timestamp_post); */ static void spi_set_thread_rt(struct spi_controller *ctlr) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 }; - dev_info(&ctlr->dev, "will run message pump with realtime priority\n"); - sched_setscheduler(ctlr->kworker_task, SCHED_FIFO, ¶m); + sched_set_fifo(ctlr->kworker_task); } static int spi_init_queue(struct spi_controller *ctlr) diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c index 0755b11348ed..563b84cda2f0 100644 --- a/drivers/staging/android/ion/ion_heap.c +++ b/drivers/staging/android/ion/ion_heap.c @@ -244,8 +244,6 @@ static int ion_heap_deferred_free(void *data) int ion_heap_init_deferred_free(struct ion_heap *heap) { - struct sched_param param = { .sched_priority = 0 }; - INIT_LIST_HEAD(&heap->free_list); init_waitqueue_head(&heap->waitqueue); heap->task = kthread_run(ion_heap_deferred_free, heap, @@ -255,7 +253,7 @@ int ion_heap_init_deferred_free(struct ion_heap *heap) __func__); return PTR_ERR_OR_ZERO(heap->task); } - sched_setscheduler(heap->task, SCHED_IDLE, ¶m); + sched_set_normal(heap->task, 19); return 0; } diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index f74b2473440d..b0eb5ece9243 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -70,9 +70,6 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update */ static bool clamping; -static const struct sched_param sparam = { - .sched_priority = MAX_USER_RT_PRIO / 2, -}; struct powerclamp_worker_data { struct kthread_worker 
*worker; struct kthread_work balancing_work; @@ -488,7 +485,7 @@ static void start_power_clamp_worker(unsigned long cpu) w_data->cpu = cpu; w_data->clamping = true; set_bit(cpu, cpu_clamping_mask); - sched_setscheduler(worker->task, SCHED_FIFO, &sparam); + sched_set_fifo(worker->task); kthread_init_work(&w_data->balancing_work, clamp_balancing_func); kthread_init_delayed_work(&w_data->idle_injection_work, clamp_idle_injection_func); diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c index d2e5c6c86643..809610b37c71 100644 --- a/drivers/tty/serial/sc16is7xx.c +++ b/drivers/tty/serial/sc16is7xx.c @@ -1179,7 +1179,6 @@ static int sc16is7xx_probe(struct device *dev, const struct sc16is7xx_devtype *devtype, struct regmap *regmap, int irq) { - struct sched_param sched_param = { .sched_priority = MAX_RT_PRIO / 2 }; unsigned long freq = 0, *pfreq = dev_get_platdata(dev); unsigned int val; u32 uartclk = 0; @@ -1239,7 +1238,7 @@ static int sc16is7xx_probe(struct device *dev, ret = PTR_ERR(s->kworker_task); goto out_clk; } - sched_setscheduler(s->kworker_task, SCHED_FIFO, &sched_param); + sched_set_fifo(s->kworker_task); #ifdef CONFIG_GPIOLIB if (devtype->nr_gpio) { diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 7e4cd34a8c20..b9dc2c352151 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1144,14 +1144,13 @@ void watchdog_dev_unregister(struct watchdog_device *wdd) int __init watchdog_dev_init(void) { int err; - struct sched_param param = {.sched_priority = MAX_RT_PRIO - 1,}; watchdog_kworker = kthread_create_worker(0, "watchdogd"); if (IS_ERR(watchdog_kworker)) { pr_err("Failed to create watchdog kworker\n"); return PTR_ERR(watchdog_kworker); } - sched_setscheduler(watchdog_kworker->task, SCHED_FIFO, ¶m); + sched_set_fifo(watchdog_kworker->task); err = class_register(&watchdog_class); if (err < 0) { diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index e9e27a271af0..feaa5e182b7b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,6 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file, } else { inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); + inode->i_mtime = current_time(inode); inode_unlock(inode); } @@ -72,10 +73,8 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, ssize_t size = 0; int err; - while (!__ratelimit(&file->f_cred->user->ratelimit)) { - if (!msleep_interruptible(50)) - return -EINTR; - } + while (!__ratelimit(&file->f_cred->user->ratelimit)) + msleep(50); err = efivar_entry_size(var, &datasize); diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ee37256ec8bd..5e55302e3bf6 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -33,6 +33,14 @@ #define __no_sanitize_thread #endif +#if __has_feature(undefined_behavior_sanitizer) +/* GCC does not have __SANITIZE_UNDEFINED__ */ +#define __no_sanitize_undefined \ + __attribute__((no_sanitize("undefined"))) +#else +#define __no_sanitize_undefined +#endif + /* * Not all versions of clang implement the the type-generic versions * of the builtin overflow checkers. 
Fortunately, clang implements diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7dd4e0349ef3..1c74464c80c6 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -150,6 +150,12 @@ #define __no_sanitize_thread #endif +#if __has_attribute(__no_sanitize_undefined__) +#define __no_sanitize_undefined __attribute__((no_sanitize_undefined)) +#else +#define __no_sanitize_undefined +#endif + #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index cdf016596659..c8f03d2969df 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -40,6 +40,7 @@ # define __GCC4_has_attribute___noclone__ 1 # define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) +# define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9) # define __GCC4_has_attribute___fallthrough__ 0 #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e368384445b6..c3bf7710f69a 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -118,10 +118,6 @@ struct ftrace_likely_data { #define notrace __attribute__((__no_instrument_function__)) #endif -/* Section for code which can't be instrumented at all */ -#define noinstr \ - noinline notrace __attribute((__section__(".noinstr.text"))) - /* * it doesn't make sense on ARM (currently the only user of __naked) * to trace naked functions because then mcount is called without @@ -193,16 +189,18 @@ struct ftrace_likely_data { #define __no_kcsan __no_sanitize_thread #ifdef __SANITIZE_THREAD__ -# define __no_kcsan_or_inline __no_kcsan notrace __maybe_unused -# define __no_sanitize_or_inline __no_kcsan_or_inline -#else -# define __no_kcsan_or_inline __always_inline +# define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused #endif #ifndef __no_sanitize_or_inline #define __no_sanitize_or_inline __always_inline #endif +/* Section for code which can't be instrumented at all */ +#define noinstr \ + noinline notrace __attribute((__section__(".noinstr.text"))) \ + __no_kcsan __no_sanitize_address + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 52692587f7fe..8aa84c052fdf 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -64,6 +64,7 @@ extern ssize_t cpu_show_tsx_async_abort(struct device *dev, char *buf); extern ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/include/linux/efi.h b/include/linux/efi.h index 2c6495f72f79..bb35f3305e55 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -350,6 +350,7 @@ void efi_native_runtime_setup(void); * associated with ConOut */ #define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, 0xb9, 0x0e, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) +#define LINUX_EFI_ARM_CPU_STATE_TABLE_GUID EFI_GUID(0xef79e4aa, 0x3c3d, 0x4989, 0xb9, 0x02, 0x07, 0xa9, 0x43, 0xe5, 0x50, 0xd2) #define LINUX_EFI_LOADER_ENTRY_GUID EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) #define LINUX_EFI_RANDOM_SEED_TABLE_GUID EFI_GUID(0x1ce1e5bc, 0x7ceb, 0x42f2, 0x81, 0xe5, 0x8a, 
0xad, 0xf1, 0x80, 0xf5, 0x7b) #define LINUX_EFI_TPM_EVENT_LOG_GUID EFI_GUID(0xb7799cb0, 0xeca2, 0x4943, 0x96, 0x67, 0x1f, 0xae, 0x07, 0xb7, 0x47, 0xfa) @@ -1236,14 +1237,11 @@ struct linux_efi_memreserve { struct { phys_addr_t base; phys_addr_t size; - } entry[0]; + } entry[]; }; -#define EFI_MEMRESERVE_SIZE(count) (sizeof(struct linux_efi_memreserve) + \ - (count) * sizeof(((struct linux_efi_memreserve *)0)->entry[0])) - #define EFI_MEMRESERVE_COUNT(size) (((size) - sizeof(struct linux_efi_memreserve)) \ - / sizeof(((struct linux_efi_memreserve *)0)->entry[0])) + / sizeof_field(struct linux_efi_memreserve, entry[0])) void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e339dac91ee6..ce2c06f72e86 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct ftrace_direct_func; const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); -int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported); #else static inline const char * ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, @@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, { return NULL; } +#endif + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported); +#else static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name, char *module_name, int *exported) @@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val } #endif - #ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; @@ -207,6 +210,7 @@ struct ftrace_ops { struct ftrace_ops_hash old_hash; unsigned long trampoline; unsigned long trampoline_size; + struct list_head list; #endif }; diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 2735da5f839e..30823780c192 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,7 +2,7 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H -#include <linux/llist.h> +#include <linux/smp_types.h> /* * An entry can be in one of four states: @@ -13,24 +13,14 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ -/* flags share CSD_FLAG_ space */ - -#define IRQ_WORK_PENDING BIT(0) -#define IRQ_WORK_BUSY BIT(1) - -/* Doesn't want IPI, wait for tick: */ -#define IRQ_WORK_LAZY BIT(2) -/* Run hard IRQ context, even on RT */ -#define IRQ_WORK_HARD_IRQ BIT(3) - -#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY) - -/* - * structure shares layout with single_call_data_t. 
- */ struct irq_work { - struct llist_node llnode; - atomic_t flags; + union { + struct __call_single_node node; + struct { + struct llist_node llnode; + atomic_t flags; + }; + }; void (*func)(struct irq_work *); }; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 6adf90f248d7..45b8cdc9fad7 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -242,6 +242,7 @@ struct kprobe_insn_cache { struct mutex mutex; void *(*alloc)(void); /* allocate insn page */ void (*free)(void *); /* free insn page */ + const char *sym; /* symbol for insn pages */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; @@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ { \ return __is_insn_slot_addr(&kprobe_##__name##_slots, addr); \ } +#define KPROBE_INSN_PAGE_SYM "kprobe_insn_page" +#define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym); #else /* __ARCH_WANT_KPROBES_INSN_SLOT */ #define DEFINE_INSN_CACHE_OPS(__name) \ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ @@ -377,6 +382,11 @@ void dump_kprobe(struct kprobe *kp); void *alloc_insn_page(void); void free_insn_page(void *page); +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym); + +int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym); #else /* !CONFIG_KPROBES: */ static inline int kprobes_built_in(void) @@ -439,6 +449,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr) { return true; } +static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} #endif /* CONFIG_KPROBES */ static inline int disable_kretprobe(struct kretprobe *rp) { diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8fce5c98a4b0..3b73cf84f77d 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -10,181 +10,20 @@ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H +#include <linux/lockdep_types.h> + struct task_struct; -struct lockdep_map; /* for sysctl */ extern int prove_locking; extern int lock_stat; -#define MAX_LOCKDEP_SUBCLASSES 8UL - -#include <linux/types.h> - -enum lockdep_wait_type { - LD_WAIT_INV = 0, /* not checked, catch all */ - - LD_WAIT_FREE, /* wait free, rcu etc.. */ - LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ - -#ifdef CONFIG_PROVE_RAW_LOCK_NESTING - LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ -#else - LD_WAIT_CONFIG = LD_WAIT_SPIN, -#endif - LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */ - - LD_WAIT_MAX, /* must be last */ -}; - #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> -#include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> -/* - * We'd rather not expose kernel/lockdep_states.h this wide, but we do need - * the total number of states... :-( - */ -#define XXX_LOCK_USAGE_STATES (1+2*4) - -/* - * NR_LOCKDEP_CACHING_CLASSES ... Number of classes - * cached in the instance of lockdep_map - * - * Currently main class (subclass == 0) and signle depth subclass - * are cached in lockdep_map. This optimization is mainly targeting - * on rq->lock. double_rq_lock() acquires this highly competitive with - * single depth. - */ -#define NR_LOCKDEP_CACHING_CLASSES 2 - -/* - * A lockdep key is associated with each lock object. 
For static locks we use - * the lock address itself as the key. Dynamically allocated lock objects can - * have a statically or dynamically allocated key. Dynamically allocated lock - * keys must be registered before being used and must be unregistered before - * the key memory is freed. - */ -struct lockdep_subclass_key { - char __one_byte; -} __attribute__ ((__packed__)); - -/* hash_entry is used to keep track of dynamically allocated keys. */ -struct lock_class_key { - union { - struct hlist_node hash_entry; - struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; - }; -}; - -extern struct lock_class_key __lockdep_no_validate__; - -struct lock_trace; - -#define LOCKSTAT_POINTS 4 - -/* - * The lock-class itself. The order of the structure members matters. - * reinit_class() zeroes the key member and all subsequent members. - */ -struct lock_class { - /* - * class-hash: - */ - struct hlist_node hash_entry; - - /* - * Entry in all_lock_classes when in use. Entry in free_lock_classes - * when not in use. Instances that are being freed are on one of the - * zapped_classes lists. - */ - struct list_head lock_entry; - - /* - * These fields represent a directed graph of lock dependencies, - * to every node we attach a list of "forward" and a list of - * "backward" graph nodes. - */ - struct list_head locks_after, locks_before; - - const struct lockdep_subclass_key *key; - unsigned int subclass; - unsigned int dep_gen_id; - - /* - * IRQ/softirq usage tracking bits: - */ - unsigned long usage_mask; - const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; - - /* - * Generation counter, when doing certain classes of graph walking, - * to ensure that we check one node only once: - */ - int name_version; - const char *name; - - short wait_type_inner; - short wait_type_outer; - -#ifdef CONFIG_LOCK_STAT - unsigned long contention_point[LOCKSTAT_POINTS]; - unsigned long contending_point[LOCKSTAT_POINTS]; -#endif -} __no_randomize_layout; - -#ifdef CONFIG_LOCK_STAT -struct lock_time { - s64 min; - s64 max; - s64 total; - unsigned long nr; -}; - -enum bounce_type { - bounce_acquired_write, - bounce_acquired_read, - bounce_contended_write, - bounce_contended_read, - nr_bounce_types, - - bounce_acquired = bounce_acquired_write, - bounce_contended = bounce_contended_write, -}; - -struct lock_class_stats { - unsigned long contention_point[LOCKSTAT_POINTS]; - unsigned long contending_point[LOCKSTAT_POINTS]; - struct lock_time read_waittime; - struct lock_time write_waittime; - struct lock_time read_holdtime; - struct lock_time write_holdtime; - unsigned long bounces[nr_bounce_types]; -}; - -struct lock_class_stats lock_stats(struct lock_class *class); -void clear_lock_stats(struct lock_class *class); -#endif - -/* - * Map the lock object (the lock instance) to the lock-class object. 
- * This is embedded into specific lock instances: - */ -struct lockdep_map { - struct lock_class_key *key; - struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; - const char *name; - short wait_type_outer; /* can be taken in this context */ - short wait_type_inner; /* presents this context */ -#ifdef CONFIG_LOCK_STAT - int cpu; - unsigned long ip; -#endif -}; - static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { @@ -440,8 +279,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock, extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); -struct pin_cookie { unsigned int val; }; - #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); @@ -520,10 +357,6 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) -/* - * The class key takes no space if lockdep is disabled: - */ -struct lock_class_key { }; static inline void lockdep_register_key(struct lock_class_key *key) { @@ -533,11 +366,6 @@ static inline void lockdep_unregister_key(struct lock_class_key *key) { } -/* - * The lockdep_map takes no space if lockdep is disabled: - */ -struct lockdep_map { }; - #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) @@ -549,8 +377,6 @@ struct lockdep_map { }; #define lockdep_recursing(tsk) (0) -struct pin_cookie { }; - #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h new file mode 100644 index 000000000000..7b9350624577 --- /dev/null +++ b/include/linux/lockdep_types.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Runtime locking correctness validator + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * see Documentation/locking/lockdep-design.rst for more details. + */ +#ifndef __LINUX_LOCKDEP_TYPES_H +#define __LINUX_LOCKDEP_TYPES_H + +#include <linux/types.h> + +#define MAX_LOCKDEP_SUBCLASSES 8UL + +enum lockdep_wait_type { + LD_WAIT_INV = 0, /* not checked, catch all */ + + LD_WAIT_FREE, /* wait free, rcu etc.. */ + LD_WAIT_SPIN, /* spin loops, raw_spinlock_t etc.. */ + +#ifdef CONFIG_PROVE_RAW_LOCK_NESTING + LD_WAIT_CONFIG, /* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */ +#else + LD_WAIT_CONFIG = LD_WAIT_SPIN, +#endif + LD_WAIT_SLEEP, /* sleeping locks, mutex_t etc.. */ + + LD_WAIT_MAX, /* must be last */ +}; + +#ifdef CONFIG_LOCKDEP + +#include <linux/list.h> + +/* + * We'd rather not expose kernel/lockdep_states.h this wide, but we do need + * the total number of states... :-( + */ +#define XXX_LOCK_USAGE_STATES (1+2*4) + +/* + * NR_LOCKDEP_CACHING_CLASSES ... Number of classes + * cached in the instance of lockdep_map + * + * Currently main class (subclass == 0) and signle depth subclass + * are cached in lockdep_map. This optimization is mainly targeting + * on rq->lock. double_rq_lock() acquires this highly competitive with + * single depth. + */ +#define NR_LOCKDEP_CACHING_CLASSES 2 + +/* + * A lockdep key is associated with each lock object. For static locks we use + * the lock address itself as the key. Dynamically allocated lock objects can + * have a statically or dynamically allocated key. 
Dynamically allocated lock + * keys must be registered before being used and must be unregistered before + * the key memory is freed. + */ +struct lockdep_subclass_key { + char __one_byte; +} __attribute__ ((__packed__)); + +/* hash_entry is used to keep track of dynamically allocated keys. */ +struct lock_class_key { + union { + struct hlist_node hash_entry; + struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; + }; +}; + +extern struct lock_class_key __lockdep_no_validate__; + +struct lock_trace; + +#define LOCKSTAT_POINTS 4 + +/* + * The lock-class itself. The order of the structure members matters. + * reinit_class() zeroes the key member and all subsequent members. + */ +struct lock_class { + /* + * class-hash: + */ + struct hlist_node hash_entry; + + /* + * Entry in all_lock_classes when in use. Entry in free_lock_classes + * when not in use. Instances that are being freed are on one of the + * zapped_classes lists. + */ + struct list_head lock_entry; + + /* + * These fields represent a directed graph of lock dependencies, + * to every node we attach a list of "forward" and a list of + * "backward" graph nodes. + */ + struct list_head locks_after, locks_before; + + const struct lockdep_subclass_key *key; + unsigned int subclass; + unsigned int dep_gen_id; + + /* + * IRQ/softirq usage tracking bits: + */ + unsigned long usage_mask; + const struct lock_trace *usage_traces[XXX_LOCK_USAGE_STATES]; + + /* + * Generation counter, when doing certain classes of graph walking, + * to ensure that we check one node only once: + */ + int name_version; + const char *name; + + short wait_type_inner; + short wait_type_outer; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; +#endif +} __no_randomize_layout; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; +}; + +enum bounce_type { + bounce_acquired_write, + bounce_acquired_read, + bounce_contended_write, + bounce_contended_read, + nr_bounce_types, + + bounce_acquired = bounce_acquired_write, + bounce_contended = bounce_contended_write, +}; + +struct lock_class_stats { + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; + struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; + unsigned long bounces[nr_bounce_types]; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + +/* + * Map the lock object (the lock instance) to the lock-class object. 
+ * This is embedded into specific lock instances: + */ +struct lockdep_map { + struct lock_class_key *key; + struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; + const char *name; + short wait_type_outer; /* can be taken in this context */ + short wait_type_inner; /* presents this context */ +#ifdef CONFIG_LOCK_STAT + int cpu; + unsigned long ip; +#endif +}; + +struct pin_cookie { unsigned int val; }; + +#else /* !CONFIG_LOCKDEP */ + +/* + * The class key takes no space if lockdep is disabled: + */ +struct lock_class_key { }; + +/* + * The lockdep_map takes no space if lockdep is disabled: + */ +struct lockdep_map { }; + +struct pin_cookie { }; + +#endif /* !LOCKDEP */ + +#endif /* __LINUX_LOCKDEP_TYPES_H */ diff --git a/include/linux/math64.h b/include/linux/math64.h index 11a267413e8e..d097119419e6 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -263,6 +263,8 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ +u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); + #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b4bb32082342..46fe5cfb5163 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1232,6 +1232,9 @@ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); +extern void perf_event_text_poke(const void *addr, + const void *old_bytes, size_t old_len, + const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); @@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_text_poke(const void *addr, + const void *old_bytes, + size_t old_len, + const void *new_bytes, + size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 4b7258495a04..b95f3211566a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -153,9 +153,10 @@ struct psi_group { unsigned long avg[NR_PSI_STATES - 1][3]; /* Monitor work control */ - atomic_t poll_scheduled; - struct kthread_worker __rcu *poll_kworker; - struct kthread_delayed_work poll_work; + struct task_struct __rcu *poll_task; + struct timer_list poll_timer; + wait_queue_head_t poll_wait; + atomic_t poll_wakeup; /* Protects data used by the monitor */ struct mutex trigger_lock; diff --git a/include/linux/sched.h b/include/linux/sched.h index 224b5de568e7..9bd073a10224 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -654,11 +654,8 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct { - struct llist_node wake_entry; - unsigned int wake_entry_type; - }; int on_cpu; + struct __call_single_node wake_entry; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; @@ -1655,6 +1652,9 @@ extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct 
*, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern void sched_set_fifo(struct task_struct *p); +extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index 0fbcbacd1b29..cc9f393e2a70 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -14,6 +14,7 @@ enum hk_flags { HK_FLAG_DOMAIN = (1 << 5), HK_FLAG_WQ = (1 << 6), HK_FLAG_MANAGED_IRQ = (1 << 7), + HK_FLAG_KTHREAD = (1 << 8), }; #ifdef CONFIG_CPU_ISOLATION diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 660ac49f2b53..24be30a40814 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -61,6 +61,9 @@ int sched_proc_update_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +extern unsigned int sysctl_sched_dl_period_max; +extern unsigned int sysctl_sched_dl_period_min; + #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; diff --git a/include/linux/smp.h b/include/linux/smp.h index 7ee202ad21a6..80d557ef8a11 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -12,29 +12,22 @@ #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> -#include <linux/llist.h> +#include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); -enum { - CSD_FLAG_LOCK = 0x01, - - /* IRQ_WORK_flags */ - - CSD_TYPE_ASYNC = 0x00, - CSD_TYPE_SYNC = 0x10, - CSD_TYPE_IRQ_WORK = 0x20, - CSD_TYPE_TTWU = 0x30, - CSD_FLAG_TYPE_MASK = 0xF0, -}; - /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { - struct llist_node llist; - unsigned int flags; + union { + struct __call_single_node node; + struct { + struct llist_node llist; + unsigned int flags; + }; + }; smp_call_func_t func; void *info; }; diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h new file mode 100644 index 000000000000..364b3ae3e41d --- /dev/null +++ b/include/linux/smp_types.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_SMP_TYPES_H +#define __LINUX_SMP_TYPES_H + +#include <linux/llist.h> + +enum { + CSD_FLAG_LOCK = 0x01, + + IRQ_WORK_PENDING = 0x01, + IRQ_WORK_BUSY = 0x02, + IRQ_WORK_LAZY = 0x04, /* No IPI, wait for tick */ + IRQ_WORK_HARD_IRQ = 0x08, /* IRQ context on PREEMPT_RT */ + + IRQ_WORK_CLAIMED = (IRQ_WORK_PENDING | IRQ_WORK_BUSY), + + CSD_TYPE_ASYNC = 0x00, + CSD_TYPE_SYNC = 0x10, + CSD_TYPE_IRQ_WORK = 0x20, + CSD_TYPE_TTWU = 0x30, + + CSD_FLAG_TYPE_MASK = 0xF0, +}; + +/* + * struct __call_single_node is the primary type on + * smp.c:call_single_queue. + * + * flush_smp_call_function_queue() only reads the type from + * __call_single_node::u_flags as a regular load, the above + * (anonymous) enum defines all the bits of this word. + * + * Other bits are not modified until the type is known. 
+ * + * CSD_TYPE_SYNC/ASYNC: + * struct { + * struct llist_node node; + * unsigned int flags; + * smp_call_func_t func; + * void *info; + * }; + * + * CSD_TYPE_IRQ_WORK: + * struct { + * struct llist_node node; + * atomic_t flags; + * void (*func)(struct irq_work *); + * }; + * + * CSD_TYPE_TTWU: + * struct { + * struct llist_node node; + * unsigned int flags; + * }; + * + */ + +struct __call_single_node { + struct llist_node llist; + union { + unsigned int u_flags; + atomic_t a_flags; + }; +}; + +#endif /* __LINUX_SMP_TYPES_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d3770b3f9d9a..f2f12d746dbd 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -56,6 +56,7 @@ #include <linux/kernel.h> #include <linux/stringify.h> #include <linux/bottom_half.h> +#include <linux/lockdep.h> #include <asm/barrier.h> #include <asm/mmiowb.h> diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index 6102e6bff3ae..b981caafe8bf 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -15,7 +15,7 @@ # include <linux/spinlock_types_up.h> #endif -#include <linux/lockdep.h> +#include <linux/lockdep_types.h> typedef struct raw_spinlock { arch_spinlock_t raw_lock; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7c354c2955f5..b951a87da987 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1360,7 +1360,7 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -static inline long ksys_ftruncate(unsigned int fd, unsigned long length) +static inline long ksys_ftruncate(unsigned int fd, loff_t length) { return do_sys_ftruncate(fd, length, 1); } diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index b27e2ffa96c1..d5471d6fa778 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -222,9 +222,9 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/* +/** * struct system_time_snapshot - simultaneous raw/real time capture with - * counter value + * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time @@ -239,9 +239,9 @@ struct system_time_snapshot { u8 cs_was_changed_seq; }; -/* +/** * struct system_device_crosststamp - system/device cross-timestamp - * (syncronized capture) + * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time @@ -252,12 +252,12 @@ struct system_device_crosststamp { ktime_t sys_monoraw; }; -/* +/** * struct system_counterval_t - system counter value with the pointer to the - * corresponding clocksource + * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. 
Used by - * timekeeping code to verify comparibility of two cycle values + * timekeeping code to verify comparibility of two cycle values */ struct system_counterval_t { u64 cycles; diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 4f8c90c93c29..64356b199e94 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -81,6 +81,8 @@ struct tcg_efi_specid_event_algs { u16 digest_size; } __packed; +#define TCG_SPECID_SIG "Spec ID Event03" + struct tcg_efi_specid_event_head { u8 signature[16]; u32 platform_class; @@ -171,6 +173,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, int i; int j; u32 count, event_type; + const u8 zero_digest[sizeof(event_header->digest)] = {0}; marker = event; marker_start = marker; @@ -198,10 +201,19 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, count = READ_ONCE(event->count); event_type = READ_ONCE(event->event_type); + /* Verify that it's the log header */ + if (event_header->pcr_idx != 0 || + event_header->event_type != NO_ACTION || + memcmp(event_header->digest, zero_digest, sizeof(zero_digest))) { + size = 0; + goto out; + } + efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (count > efispecid->num_algs) { + if (memcmp(efispecid->signature, TCG_SPECID_SIG, + sizeof(TCG_SPECID_SIG)) || count > efispecid->num_algs) { size = 0; goto out; } diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ed168b0e2c53..04f9a4c7b0d9 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -634,6 +634,14 @@ DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); +DECLARE_TRACE(sched_util_est_cfs_tp, + TP_PROTO(struct cfs_rq *cfs_rq), + TP_ARGS(cfs_rq)); + +DECLARE_TRACE(sched_util_est_se_tp, + TP_PROTO(struct sched_entity *se), + TP_ARGS(se)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..52ca2093831c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -383,7 +383,8 @@ struct perf_event_attr { bpf_event : 1, /* include bpf events */ aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ - __reserved_1 : 31; + text_poke : 1, /* include text poke events */ + __reserved_1 : 30; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1024,12 +1025,35 @@ enum perf_event_type { */ PERF_RECORD_CGROUP = 19, + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + PERF_RECORD_MAX, /* non-ABI */ }; enum perf_record_ksymbol_type { PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes or ftrace trampolines. 
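For reference, the PERF_RECORD_TEXT_POKE layout documented in the perf_event.h comment above maps onto a C structure along these lines; the struct name is ours for illustration and is not part of the UAPI:

    #include <linux/types.h>
    #include <linux/perf_event.h>   /* struct perf_event_header */

    struct text_poke_record {
            struct perf_event_header header;
            __u64 addr;      /* address of the modified kernel text */
            __u16 old_len;   /* number of original bytes */
            __u16 new_len;   /* number of replacement bytes */
            __u8  bytes[];   /* old bytes, then new bytes, then padding
                              * so the trailing sample_id stays u64-aligned */
    };

Tools opt in to these records via the new perf_event_attr.text_poke bit added in the same hunk.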
+ */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ }; diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index d72beda824aa..53314d7da4be 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -45,11 +45,6 @@ static int __init early_initrdmem(char *p) } early_param("initrdmem", early_initrdmem); -/* - * This is here as the initrd keyword has been in use since 11/2018 - * on ARM, PowerPC, and MIPS. - * It should not be; it is reserved for bootloaders. - */ static int __init early_initrd(char *p) { return early_initrdmem(p); diff --git a/kernel/events/core.c b/kernel/events/core.c index 856d98c36f56..9b8f92500833 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; +static atomic_t nr_text_poke_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || attr->task || attr->ksymbol || - attr->context_switch || + attr->context_switch || attr->text_poke || attr->bpf_event) return true; return false; @@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_dec(&nr_bpf_events); + if (event->attr.text_poke) + atomic_dec(&nr_text_poke_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_text_poke_event { + const void *old_bytes; + const void *new_bytes; + size_t pad; + u16 old_len; + u16 new_len; + + struct { + struct perf_event_header header; + + u64 addr; + } event_id; +}; + +static int perf_event_text_poke_match(struct perf_event *event) +{ + return event->attr.text_poke; +} + +static void perf_event_text_poke_output(struct perf_event *event, void *data) +{ + struct perf_text_poke_event *text_poke_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + u64 padding = 0; + int ret; + + if (!perf_event_text_poke_match(event)) + return; + + perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, text_poke_event->event_id); + perf_output_put(&handle, text_poke_event->old_len); + perf_output_put(&handle, text_poke_event->new_len); + + __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len); + __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len); + + if (text_poke_event->pad) + __output_copy(&handle, &padding, text_poke_event->pad); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_text_poke(const void *addr, const void *old_bytes, + size_t old_len, const void *new_bytes, size_t new_len) +{ + struct perf_text_poke_event text_poke_event; + size_t tot, pad; + + if (!atomic_read(&nr_text_poke_events)) + return; + + tot = sizeof(text_poke_event.old_len) + old_len; + tot += sizeof(text_poke_event.new_len) + new_len; + pad = ALIGN(tot, sizeof(u64)) - tot; + + text_poke_event = (struct perf_text_poke_event){ + .old_bytes = old_bytes, + 
.new_bytes = new_bytes, + .pad = pad, + .old_len = old_len, + .new_len = new_len, + .event_id = { + .header = { + .type = PERF_RECORD_TEXT_POKE, + .misc = PERF_RECORD_MISC_KERNEL, + .size = sizeof(text_poke_event.event_id) + tot + pad, + }, + .addr = (unsigned long)addr, + }, + }; + + perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_ksymbol_events); if (event->attr.bpf_event) atomic_inc(&nr_bpf_events); + if (event->attr.text_poke) + atomic_inc(&nr_text_poke_events); if (inc) { /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 761911168438..06b62746e1df 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1271,9 +1271,6 @@ static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { struct task_struct *t; - struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; if (!secondary) { t = kthread_create(irq_thread, new, "irq/%d-%s", irq, @@ -1281,13 +1278,12 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) } else { t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, new->name); - param.sched_priority -= 1; } if (IS_ERR(t)) return PTR_ERR(t); - sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); + sched_set_fifo(t); /* * We keep the reference to the task struct even if diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..834bfdc43235 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include <linux/compiler.h> /* @@ -437,6 +438,7 @@ struct kallsym_iter { loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; + loff_t pos_bpf_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; @@ -480,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) return 1; } +/* + * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace + * purposes. In that case "__builtin__ftrace" is used as a module name, even + * though "__builtin__ftrace" is not a module. + */ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, @@ -496,11 +503,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { + int ret; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; - return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, - &iter->value, &iter->type, - iter->name) < 0 ? 0 : 1; + ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, + &iter->value, &iter->type, + iter->name); + if (ret < 0) { + iter->pos_bpf_end = iter->pos; + return 0; + } + + return 1; +} + +/* + * This uses "__builtin__kprobes" as a module name for symbols for pages + * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a + * module. + */ +static int get_ksymbol_kprobe(struct kallsym_iter *iter) +{ + strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN); + iter->exported = 0; + return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end, + &iter->value, &iter->type, + iter->name) < 0 ? 0 : 1; } /* Returns space to next name. 
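With the kallsyms.c changes above and the kprobes.c side of this merge, /proc/kallsyms gains pseudo-module entries for kprobe instruction pages, along the general lines of

    <address> t kprobe_insn_page    [__builtin__kprobes]

where the symbol and pseudo-module names come from KPROBE_INSN_PAGE_SYM and get_ksymbol_kprobe() (the address column is elided here), next to the existing [bpf] and [__builtin__ftrace] entries.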
*/ @@ -527,6 +556,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; + iter->pos_bpf_end = 0; } } @@ -551,7 +581,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) get_ksymbol_ftrace_mod(iter)) return 1; - return get_ksymbol_bpf(iter); + if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) && + get_ksymbol_bpf(iter)) + return 1; + + return get_ksymbol_kprobe(iter); } /* Returns false if pos at or past end of file. */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..0924397d9d59 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,6 +35,7 @@ #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/jump_label.h> +#include <linux/perf_event.h> #include <asm/sections.h> #include <asm/cacheflush.h> @@ -123,6 +124,7 @@ struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_INSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -188,6 +190,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) kip->cache = c; list_add_rcu(&kip->list, &c->pages); slot = kip->insns; + + /* Record the perf ksymbol register event after adding the page */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns, + PAGE_SIZE, false, c->sym); out: mutex_unlock(&c->mutex); return slot; @@ -206,6 +212,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) * next time somebody inserts a probe. */ if (!list_is_singular(&kip->list)) { + /* + * Record perf ksymbol unregister event before removing + * the page. + */ + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + (unsigned long)kip->insns, PAGE_SIZE, true, + kip->cache->sym); list_del_rcu(&kip->list); synchronize_rcu(); kip->cache->free(kip->insns); @@ -295,12 +308,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr) return ret; } +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, + unsigned long *value, char *type, char *sym) +{ + struct kprobe_insn_page *kip; + int ret = -ERANGE; + + rcu_read_lock(); + list_for_each_entry_rcu(kip, &c->pages, list) { + if ((*symnum)--) + continue; + strlcpy(sym, c->sym, KSYM_NAME_LEN); + *type = 't'; + *value = (unsigned long)kip->insns; + ret = 0; + break; + } + rcu_read_unlock(); + + return ret; +} + #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .alloc = alloc_insn_page, .free = free_insn_page, + .sym = KPROBE_OPTINSN_PAGE_SYM, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, @@ -2232,6 +2267,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry) kprobe_remove_area_blacklist(entry, entry + 1); } +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, + char *type, char *sym) +{ + return -ERANGE; +} + +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *sym) +{ +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT + if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym)) + return 0; +#ifdef CONFIG_OPTPROBES + if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym)) + return 0; +#endif +#endif + if (!arch_kprobe_get_kallsym(&symnum, value, 
type, sym)) + return 0; + return -ERANGE; +} + int __init __weak arch_populate_kprobe_blacklist(void) { return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index 132f84a5fde3..1d9e2fdfd67a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> +#include <linux/sched/isolation.h> #include <trace/events/sched.h> @@ -383,7 +384,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, cpu_all_mask); + set_cpus_allowed_ptr(task, + housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; @@ -608,7 +610,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 5efbfc68ce99..fca41a7ded0c 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -436,8 +436,6 @@ static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) static void torture_rtmutex_boost(struct torture_random_state *trsp) { - int policy; - struct sched_param param; const unsigned int factor = 50000; /* yes, quite arbitrary */ if (!rt_task(current)) { @@ -448,8 +446,7 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (trsp && !(torture_random(trsp) % (cxt.nrealwriters_stress * factor))) { - policy = SCHED_FIFO; - param.sched_priority = MAX_RT_PRIO - 1; + sched_set_fifo(current); } else /* common case, do nothing */ return; } else { @@ -462,13 +459,10 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp) */ if (!trsp || !(torture_random(trsp) % (cxt.nrealwriters_stress * factor * 2))) { - policy = SCHED_NORMAL; - param.sched_priority = 0; + sched_set_normal(current, 0); } else /* common case, do nothing */ return; } - - sched_setscheduler_nocheck(current, policy, ¶m); } static void torture_rtmutex_delay(struct torture_random_state *trsp) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 16dd1e6b7c09..0d4f475a7b2e 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -354,7 +354,6 @@ rcu_perf_writer(void *arg) int i_max; long me = (long)arg; struct rcu_head *rhp = NULL; - struct sched_param sp; bool started = false, done = false, alldone = false; u64 t; u64 *wdp; @@ -363,8 +362,7 @@ rcu_perf_writer(void *arg) VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sp.sched_priority = 1; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + sched_set_fifo_low(current); if (holdoff) schedule_timeout_uninterruptible(holdoff * HZ); @@ -420,9 +418,7 @@ retry: started = true; if (!done && i >= MIN_MEAS) { done = true; - sp.sched_priority = 0; - sched_setscheduler_nocheck(current, - SCHED_NORMAL, &sp); + sched_set_normal(current, 0); pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", perf_type, PERF_FLAG, me, MIN_MEAS); if (atomic_inc_return(&n_rcu_perf_writer_finished) >= @@ -723,7 +719,7 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } - pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + pr_alert("kfree 
object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efb792e13fca..b4c1146de414 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -889,16 +889,11 @@ static int rcu_torture_boost(void *arg) unsigned long endtime; unsigned long oldstarttime; struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } + sched_set_fifo_low(current); init_rcu_head_on_stack(&rbi.rcu); /* Each pass through the following loop does one boost-test cycle. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8f360326861e..53c482da7527 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1637,7 +1639,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* @@ -2293,8 +2295,15 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { + if (WARN_ON_ONCE(p->on_cpu)) + smp_cond_load_acquire(&p->on_cpu, !VAL); + + if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) + set_task_cpu(p, cpu_of(rq)); + ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); + } rq_unlock_irqrestore(rq, &rf); } @@ -2322,7 +2331,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - __smp_call_single_queue(cpu, &p->wake_entry); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2369,7 +2378,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) * the soon-to-be-idle CPU as the current CPU is likely busy. * nr_running is checked to avoid unnecessary task stacking. 
*/ - if ((wake_flags & WF_ON_RQ) && cpu_rq(cpu)->nr_running <= 1) + if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) return true; return false; @@ -2378,6 +2387,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { + if (WARN_ON_ONCE(cpu == smp_processor_id())) + return false; + sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -2528,7 +2540,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; success = 1; - cpu = task_cpu(p); trace_sched_waking(p); p->state = TASK_RUNNING; trace_sched_wakeup(p); @@ -2550,7 +2561,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* We're going to change ->state: */ success = 1; - cpu = task_cpu(p); /* * Ensure we load p->on_rq _after_ p->state, otherwise it would @@ -2614,8 +2624,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * which potentially sends an IPI instead of spinning on p->on_cpu to * let the waker make forward progress. This is safe because IRQs are * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. */ - if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ)) + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) goto unlock; /* @@ -2635,6 +2658,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } +#else + cpu = task_cpu(p); #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2642,7 +2667,7 @@ unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: if (success) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, task_cpu(p), wake_flags); preempt_enable(); return success; @@ -2763,7 +2788,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP - p->wake_entry_type = CSD_TYPE_TTWU; + p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif } @@ -4533,7 +4558,8 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) */ if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else @@ -5122,6 +5148,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy, * @policy: new policy. * @param: structure containing the new RT priority. * + * Use sched_set_fifo(), read its comment. + * * Return: 0 on success. An error code otherwise. * * NOTE that the task may be already dead. @@ -5164,6 +5192,51 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, } EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); +/* + * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally + * incapable of resource management, which is the one thing an OS really should + * be doing. 
+ * + * This is of course the reason it is limited to privileged users only. + * + * Worse still; it is fundamentally impossible to compose static priority + * workloads. You cannot take two correctly working static prio workloads + * and smash them together and still expect them to work. + * + * For this reason 'all' FIFO tasks the kernel creates are basically at: + * + * MAX_RT_PRIO / 2 + * + * The administrator _MUST_ configure the system, the kernel simply doesn't + * know enough information to make a sensible choice. + */ +void sched_set_fifo(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo); + +/* + * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. + */ +void sched_set_fifo_low(struct task_struct *p) +{ + struct sched_param sp = { .sched_priority = 1 }; + WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_fifo_low); + +void sched_set_normal(struct task_struct *p, int nice) +{ + struct sched_attr attr = { + .sched_policy = SCHED_NORMAL, + .sched_nice = nice, + }; + WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); +} +EXPORT_SYMBOL_GPL(sched_set_normal); + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { @@ -5810,7 +5883,7 @@ again: if (task_running(p_rq, p) || p->state) goto out_unlock; - yielded = curr->sched_class->yield_to_task(rq, p, preempt); + yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { schedstat_inc(rq->yld_count); /* diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5cc4012572ec..8cb06c8c7eb1 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -121,6 +121,30 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (later_mask && cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 1; + + /* Ensure the capacity of the CPUs fits the task. */ + for_each_cpu(cpu, later_mask) { + if (!dl_task_fits_capacity(p, cpu)) { + cpumask_clear_cpu(cpu, later_mask); + + cap = capacity_orig_of(cpu); + + if (cap > max_cap || + (cpu == task_cpu(p) && cap == max_cap)) { + max_cap = cap; + max_cpu = cpu; + } + } + } + + if (cpumask_empty(later_mask)) + cpumask_set_cpu(max_cpu, later_mask); + return 1; } else { int best_cpu = cpudl_maximum(cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ff9435dee1df..5a55d2300452 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -520,50 +520,6 @@ void account_idle_ticks(unsigned long ticks) } /* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static u64 scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) - swap(rtime, stime); - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? 
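The sched_set_fifo()/sched_set_fifo_low()/sched_set_normal() helpers added above are what the rcutorture and psi hunks in this merge convert to. A hypothetical in-kernel caller would look like the sketch below (demo_poll_thread is an invented name, and the helpers are assumed to be declared via <linux/sched.h>); the point is that kthreads no longer hand-pick a numeric RT priority.

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_poll_thread(void *data)
{
	/*
	 * Previously:
	 *	struct sched_param sp = { .sched_priority = 1 };
	 *	sched_setscheduler(current, SCHED_FIFO, &sp);
	 *
	 * Now the caller only states its intent ("RT, but barely"):
	 */
	sched_set_fifo_low(current);

	while (!kthread_should_stop())
		msleep(100);

	return 0;
}

sched_set_fifo() is the variant for the rare kthread that really must sit in the middle of the RT range (MAX_RT_PRIO / 2), and sched_set_normal() drops a task back to SCHED_NORMAL at a given nice value.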
*/ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return scaled; -} - -/* * Adjust tick based cputime random precision against scheduler runtime * accounting. * @@ -622,7 +578,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, goto update; } - stime = scale_stime(stime, rtime, stime + utime); + stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); update: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 504d2f51b0d6..2e1483f0a298 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -54,15 +54,49 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus = 0; + int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); + + if (cpumask_subset(rd->span, cpu_active_mask)) + return cpumask_weight(rd->span); + + cpus = 0; + for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; return cpus; } + +static inline unsigned long __dl_bw_capacity(int i) +{ + struct root_domain *rd = cpu_rq(i)->rd; + unsigned long cap = 0; + + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + for_each_cpu_and(i, rd->span, cpu_active_mask) + cap += capacity_orig_of(i); + + return cap; +} + +/* + * XXX Fix: If 'rq->rd == def_root_domain' perform AC against capacity + * of the CPU the task is running on rather rd's \Sum CPU capacity. + */ +static inline unsigned long dl_bw_capacity(int i) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity) && + capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; + } else { + return __dl_bw_capacity(i); + } +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -73,6 +107,11 @@ static inline int dl_bw_cpus(int i) { return 1; } + +static inline unsigned long dl_bw_capacity(int i) +{ + return SCHED_CAPACITY_SCALE; +} #endif static inline @@ -1098,7 +1137,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) * cannot use the runtime, and so it replenishes the task. This rule * works fine for implicit deadline tasks (deadline == period), and the * CBS was designed for implicit deadline tasks. However, a task with - * constrained deadline (deadine < period) might be awakened after the + * constrained deadline (deadline < period) might be awakened after the * deadline, but before the next period. In this case, replenishing the * task would allow it to run for runtime / deadline. As in this case * deadline < period, CBS enables a task to run for more than the @@ -1604,6 +1643,7 @@ static int select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; + bool select_rq; struct rq *rq; if (sd_flag != SD_BALANCE_WAKE) @@ -1623,10 +1663,19 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) * other hand, if it has a shorter deadline, we * try to make it stay here, it might be important. 
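cputime_adjust() above replaces the hand-rolled scale_stime() loop with mul_u64_u64_div_u64(), whose generic implementation is added to lib/math/div64.c further down in this diff. The userspace snippet below only shows why the naive expression is not an option: the 64-bit product stime * rtime wraps for large (but reachable) nanosecond totals. It assumes a 64-bit gcc/clang target so unsigned __int128 can serve as the reference.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stime = 3ULL << 60;            /* large ns totals, e.g. a   */
	uint64_t rtime = 5ULL << 60;            /* long-lived many-thread    */
	uint64_t total = stime + (1ULL << 59);  /* service: stime + utime    */

	uint64_t naive = stime * rtime / total; /* product truncated to 64 bits */
	uint64_t exact = (uint64_t)((unsigned __int128)stime * rtime / total);

	printf("naive = %llu\nexact = %llu\n",
	       (unsigned long long)naive, (unsigned long long)exact);
	return 0;
}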
*/ - if (unlikely(dl_task(curr)) && - (curr->nr_cpus_allowed < 2 || - !dl_entity_preempt(&p->dl, &curr->dl)) && - (p->nr_cpus_allowed > 1)) { + select_rq = unlikely(dl_task(curr)) && + (curr->nr_cpus_allowed < 2 || + !dl_entity_preempt(&p->dl, &curr->dl)) && + p->nr_cpus_allowed > 1; + + /* + * Take the capacity of the CPU into account to + * ensure it fits the requirement of the task. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + select_rq |= !dl_task_fits_capacity(p, cpu); + + if (select_rq) { int target = find_later_rq(p); if (target != -1 && @@ -2551,11 +2600,12 @@ void sched_dl_do_global(void) int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr) { - struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; - int cpus, err = -1; + int cpus, err = -1, cpu = task_cpu(p); + struct dl_bw *dl_b = dl_bw_of(cpu); + unsigned long cap; if (attr->sched_flags & SCHED_FLAG_SUGOV) return 0; @@ -2570,15 +2620,17 @@ int sched_dl_overflow(struct task_struct *p, int policy, * allocated bandwidth of the container. */ raw_spin_lock(&dl_b->lock); - cpus = dl_bw_cpus(task_cpu(p)); + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + if (dl_policy(policy) && !task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, 0, new_bw)) { + !__dl_overflow(dl_b, cap, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && - !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { + !__dl_overflow(dl_b, cap, p->dl.dl_bw, new_bw)) { /* * XXX this is slightly incorrect: when the task * utilization decreases, we should delay the total @@ -2635,6 +2687,14 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) } /* + * Default limits for DL period; on the top end we guard against small util + * tasks still getting rediculous long effective runtimes, on the bottom end we + * guard against timer DoS. + */ +unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */ +unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */ + +/* * This function validates the new parameters of a -deadline task. 
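__checkparam_dl() above now clamps the effective SCHED_DEADLINE period to [sysctl_sched_dl_period_min, sysctl_sched_dl_period_max], 100us to roughly 4.2s by default (the sysctl entries appear in the kernel/sysctl.c hunk further down). A userspace sketch of what that means for callers, assuming a libc without a sched_setattr() wrapper but with SYS_sched_setattr in its headers, and a caller privileged enough to use SCHED_DEADLINE: on a kernel with this change the 10-second period below should be rejected with EINVAL, while older kernels may accept it (or fail for unrelated admission-control reasons). The struct layout mirrors the uapi header.

#define _GNU_SOURCE
#include <errno.h>
#include <linux/types.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {		/* mirrors include/uapi/linux/sched/types.h */
	__u32 size;
	__u32 sched_policy;
	__u64 sched_flags;
	__s32 sched_nice;
	__u32 sched_priority;
	__u64 sched_runtime;
	__u64 sched_deadline;
	__u64 sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_DEADLINE,
		.sched_runtime  = 1000000,          /*  1 ms                   */
		.sched_deadline = 10000000,         /* 10 ms                   */
		.sched_period   = 10000000000ULL,   /* 10 s: above the new cap */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		printf("sched_setattr: %s\n", strerror(errno));
	return 0;
}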
* We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or @@ -2646,6 +2706,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + u64 period, max, min; + /* special dl tasks don't actually use any parameter */ if (attr->sched_flags & SCHED_FLAG_SUGOV) return true; @@ -2669,12 +2731,21 @@ bool __checkparam_dl(const struct sched_attr *attr) attr->sched_period & (1ULL << 63)) return false; + period = attr->sched_period; + if (!period) + period = attr->sched_deadline; + /* runtime <= deadline <= period (if period != 0) */ - if ((attr->sched_period != 0 && - attr->sched_period < attr->sched_deadline) || + if (period < attr->sched_deadline || attr->sched_deadline < attr->sched_runtime) return false; + max = (u64)READ_ONCE(sysctl_sched_dl_period_max) * NSEC_PER_USEC; + min = (u64)READ_ONCE(sysctl_sched_dl_period_min) * NSEC_PER_USEC; + + if (period < min || period > max) + return false; + return true; } @@ -2692,6 +2763,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; + dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; @@ -2714,19 +2786,19 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) #ifdef CONFIG_SMP int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { + unsigned long flags, cap; unsigned int dest_cpu; struct dl_bw *dl_b; bool overflow; - int cpus, ret; - unsigned long flags; + int ret; dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); rcu_read_lock_sched(); dl_b = dl_bw_of(dest_cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(dest_cpu); - overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + cap = dl_bw_capacity(dest_cpu); + overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw); if (overflow) { ret = -EBUSY; } else { @@ -2736,6 +2808,8 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo * We will free resources in the source root_domain * later on (see set_cpus_allowed_dl()). */ + int cpus = dl_bw_cpus(dest_cpu); + __dl_add(dl_b, p->dl.dl_bw, cpus); ret = 0; } @@ -2768,16 +2842,15 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, bool dl_cpu_busy(unsigned int cpu) { - unsigned long flags; + unsigned long flags, cap; struct dl_bw *dl_b; bool overflow; - int cpus; rcu_read_lock_sched(); dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); + cap = dl_bw_capacity(cpu); + overflow = __dl_overflow(dl_b, cap, 0, 0); raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cbcb2f71599b..0424a0af5f87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3094,7 +3094,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #ifdef CONFIG_SMP do { - u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib; + u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); @@ -3440,16 +3440,18 @@ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. 
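dl_task_can_attach() and dl_cpu_busy() above switch deadline admission control from counting CPUs to summing CPU capacity (__dl_overflow() now takes a capacity argument; see the kernel/sched/sched.h hunk further down). A back-of-envelope userspace calculation, with invented numbers and the real bookkeeping stripped away, of why that matters on an asymmetric system:

#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT              20   /* matches kernel/sched/sched.h */
#define SCHED_CAPACITY_SHIFT  10

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;  /* same idea as the kernel's to_ratio() */
}

int main(void)
{
	unsigned long cap = 2 * 1024 + 2 * 512;          /* 2 big + 2 LITTLE CPUs */
	uint64_t limit = to_ratio(1000000, 950000);      /* default 95% RT limit  */
	uint64_t task_bw = to_ratio(10000000, 9000000);  /* 9 ms every 10 ms      */

	uint64_t room_by_count = limit * 4;                            /* old check */
	uint64_t room_by_cap = (limit * cap) >> SCHED_CAPACITY_SHIFT;  /* new check */

	printf("tasks admitted by CPU count: %llu\n",
	       (unsigned long long)(room_by_count / task_bw));
	printf("tasks admitted by capacity:  %llu\n",
	       (unsigned long long)(room_by_cap / task_bw));
	return 0;
}

With two 512-capacity LITTLEs the capacity-based budget is worth three full CPUs rather than four, so one fewer of these 90%-utilization tasks is admitted.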
- */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * divider; @@ -3463,16 +3465,18 @@ static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider; /* Nothing to update */ if (!delta) return; + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; se->avg.runnable_sum = se->avg.runnable_avg * divider; @@ -3500,7 +3504,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + divider = get_pelt_divider(&cfs_rq->avg); if (runnable_sum >= 0) { /* @@ -3646,7 +3650,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq->removed.nr) { unsigned long r; - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); raw_spin_lock(&cfs_rq->removed.lock); swap(cfs_rq->removed.util_avg, removed_util); @@ -3701,7 +3705,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. * See ___update_load_avg() for details. */ - u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib; + u32 divider = get_pelt_divider(&cfs_rq->avg); /* * When we attach the @se to the @cfs_rq, we must align the decay @@ -3922,6 +3926,8 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += _task_util_est(p); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); } /* @@ -3952,6 +3958,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + trace_sched_util_est_cfs_tp(cfs_rq); + /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. 
@@ -4017,6 +4025,8 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); } static inline int task_fits_capacity(struct task_struct *p, long capacity) @@ -7157,7 +7167,7 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } -static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -8038,7 +8048,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long max = arch_scale_cpu_capacity(cpu); @@ -8070,7 +8080,7 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); @@ -10016,7 +10026,12 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); @@ -10337,6 +10352,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -10357,14 +10380,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. 
- */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 05deb81bb3e3..8d75ca201484 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -446,11 +446,6 @@ prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) BUG(); } -static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_idle(struct rq *rq) { } @@ -479,8 +474,6 @@ const struct sched_class idle_sched_class = { .task_tick = task_tick_idle, - .get_rr_interval = get_rr_interval_idle, - .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, .update_curr = update_curr_idle, diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 808244f3ddd9..5a6ea03f9882 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -140,7 +140,8 @@ static int __init housekeeping_nohz_full_setup(char *str) { unsigned int flags; - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; + flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | + HK_FLAG_MISC | HK_FLAG_KTHREAD; return housekeeping_setup(str, flags); } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index b4b1ff96642f..11bea3b08115 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -83,8 +83,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -264,7 +262,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa, static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load) { - u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib; + u32 divider = get_pelt_divider(sa); /* * Step 2: update *_avg. diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index eb034d9f024d..795e43e02afc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,6 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +static inline u32 get_pelt_divider(struct sched_avg *avg) +{ + return LOAD_AVG_MAX - 1024 + avg->period_contrib; +} + /* * When a task is dequeued, its estimated utilization should not be update if * its util_avg has not been updated at least once. diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8f45cdb6463b..967732c0766c 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -190,7 +190,6 @@ static void group_init(struct psi_group *group) INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); mutex_init(&group->avgs_lock); /* Init trigger-related members */ - atomic_set(&group->poll_scheduled, 0); mutex_init(&group->trigger_lock); INIT_LIST_HEAD(&group->triggers); memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); @@ -199,7 +198,7 @@ static void group_init(struct psi_group *group) memset(group->polling_total, 0, sizeof(group->polling_total)); group->polling_next_update = ULLONG_MAX; group->polling_until = 0; - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } void __init psi_init(void) @@ -547,47 +546,38 @@ static u64 update_triggers(struct psi_group *group, u64 now) return now + group->poll_min_period; } -/* - * Schedule polling if it's not already scheduled. 
It's safe to call even from - * hotpath because even though kthread_queue_delayed_work takes worker->lock - * spinlock that spinlock is never contended due to poll_scheduled atomic - * preventing such competition. - */ +/* Schedule polling if it's not already scheduled. */ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) { - struct kthread_worker *kworker; + struct task_struct *task; - /* Do not reschedule if already scheduled */ - if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + /* + * Do not reschedule if already scheduled. + * Possible race with a timer scheduled after this check but before + * mod_timer below can be tolerated because group->polling_next_update + * will keep updates on schedule. + */ + if (timer_pending(&group->poll_timer)) return; rcu_read_lock(); - kworker = rcu_dereference(group->poll_kworker); + task = rcu_dereference(group->poll_task); /* * kworker might be NULL in case psi_trigger_destroy races with * psi_task_change (hotpath) which can't use locks */ - if (likely(kworker)) - kthread_queue_delayed_work(kworker, &group->poll_work, delay); - else - atomic_set(&group->poll_scheduled, 0); + if (likely(task)) + mod_timer(&group->poll_timer, jiffies + delay); rcu_read_unlock(); } -static void psi_poll_work(struct kthread_work *work) +static void psi_poll_work(struct psi_group *group) { - struct kthread_delayed_work *dwork; - struct psi_group *group; u32 changed_states; u64 now; - dwork = container_of(work, struct kthread_delayed_work, work); - group = container_of(dwork, struct psi_group, poll_work); - - atomic_set(&group->poll_scheduled, 0); - mutex_lock(&group->trigger_lock); now = sched_clock(); @@ -623,6 +613,32 @@ out: mutex_unlock(&group->trigger_lock); } +static int psi_poll_worker(void *data) +{ + struct psi_group *group = (struct psi_group *)data; + + sched_set_fifo_low(current); + + while (true) { + wait_event_interruptible(group->poll_wait, + atomic_cmpxchg(&group->poll_wakeup, 1, 0) || + kthread_should_stop()); + if (kthread_should_stop()) + break; + + psi_poll_work(group); + } + return 0; +} + +static void poll_timer_fn(struct timer_list *t) +{ + struct psi_group *group = from_timer(group, t, poll_timer); + + atomic_set(&group->poll_wakeup, 1); + wake_up_interruptible(&group->poll_wait); +} + static void record_times(struct psi_group_cpu *groupc, int cpu, bool memstall_tick) { @@ -1099,22 +1115,20 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, mutex_lock(&group->trigger_lock); - if (!rcu_access_pointer(group->poll_kworker)) { - struct sched_param param = { - .sched_priority = 1, - }; - struct kthread_worker *kworker; + if (!rcu_access_pointer(group->poll_task)) { + struct task_struct *task; - kworker = kthread_create_worker(0, "psimon"); - if (IS_ERR(kworker)) { + task = kthread_create(psi_poll_worker, group, "psimon"); + if (IS_ERR(task)) { kfree(t); mutex_unlock(&group->trigger_lock); - return ERR_CAST(kworker); + return ERR_CAST(task); } - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); - kthread_init_delayed_work(&group->poll_work, - psi_poll_work); - rcu_assign_pointer(group->poll_kworker, kworker); + atomic_set(&group->poll_wakeup, 0); + init_waitqueue_head(&group->poll_wait); + wake_up_process(task); + timer_setup(&group->poll_timer, poll_timer_fn, 0); + rcu_assign_pointer(group->poll_task, task); } list_add(&t->node, &group->triggers); @@ -1132,7 +1146,7 @@ static void psi_trigger_destroy(struct kref *ref) { struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); 
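The psi hunks above replace the kthread_worker plus delayed-work machinery with a plain kthread kicked by a timer and an atomic wakeup flag. Stripped of the psi specifics, the pattern looks like the sketch below (struct demo and the demo_* names are invented; error handling for kthread_run() is omitted):

#include <linux/atomic.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/timer.h>
#include <linux/wait.h>

struct demo {
	struct timer_list timer;
	wait_queue_head_t wait;
	atomic_t wakeup;
	struct task_struct *task;
};

static void demo_timer_fn(struct timer_list *t)
{
	struct demo *d = from_timer(d, t, timer);

	atomic_set(&d->wakeup, 1);
	wake_up_interruptible(&d->wait);
}

static int demo_thread_fn(void *data)
{
	struct demo *d = data;

	while (!kthread_should_stop()) {
		wait_event_interruptible(d->wait,
				atomic_cmpxchg(&d->wakeup, 1, 0) ||
				kthread_should_stop());
		/* ... the actual polling work goes here ... */
	}
	return 0;
}

static void demo_start(struct demo *d, unsigned long delay)
{
	init_waitqueue_head(&d->wait);
	atomic_set(&d->wakeup, 0);
	timer_setup(&d->timer, demo_timer_fn, 0);
	d->task = kthread_run(demo_thread_fn, d, "demo_poll");
	mod_timer(&d->timer, jiffies + delay);
}

Teardown mirrors psi_trigger_destroy(): del_timer_sync() first so no further wakeups arrive, then kthread_stop().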
struct psi_group *group = t->group; - struct kthread_worker *kworker_to_destroy = NULL; + struct task_struct *task_to_destroy = NULL; if (static_branch_likely(&psi_disabled)) return; @@ -1158,13 +1172,13 @@ static void psi_trigger_destroy(struct kref *ref) period = min(period, div_u64(tmp->win.size, UPDATES_PER_WINDOW)); group->poll_min_period = period; - /* Destroy poll_kworker when the last trigger is destroyed */ + /* Destroy poll_task when the last trigger is destroyed */ if (group->poll_states == 0) { group->polling_until = 0; - kworker_to_destroy = rcu_dereference_protected( - group->poll_kworker, + task_to_destroy = rcu_dereference_protected( + group->poll_task, lockdep_is_held(&group->trigger_lock)); - rcu_assign_pointer(group->poll_kworker, NULL); + rcu_assign_pointer(group->poll_task, NULL); } } @@ -1172,25 +1186,23 @@ static void psi_trigger_destroy(struct kref *ref) /* * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_kworker RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_kworker + * poll_task RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_task */ synchronize_rcu(); /* * Destroy the kworker after releasing trigger_lock to prevent a * deadlock while waiting for psi_poll_work to acquire trigger_lock */ - if (kworker_to_destroy) { + if (task_to_destroy) { /* * After the RCU grace period has expired, the worker - * can no longer be found through group->poll_kworker. + * can no longer be found through group->poll_task. * But it might have been already scheduled before * that - deschedule it cleanly before destroying it. */ - kthread_cancel_delayed_work_sync(&group->poll_work); - atomic_set(&group->poll_scheduled, 0); - - kthread_destroy_worker(kworker_to_destroy); + del_timer_sync(&group->poll_timer); + kthread_stop(task_to_destroy); } kfree(t); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d4e94c1e5fe..847f43fc9584 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -310,11 +310,26 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus) __dl_update(dl_b, -((s32)tsk_bw / cpus)); } -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap, + u64 old_bw, u64 new_bw) { return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; + cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw; +} + +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the CPU original capacity of the + * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the + * task and false otherwise. 
+ */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; } extern void init_dl_bw(struct dl_bw *dl_b); @@ -1682,7 +1697,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_RQ 0x08 /* Wakee is on_rq */ +#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1748,7 +1763,7 @@ struct sched_class { void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); - bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 4c9e9975684f..3e50a6a8f1e5 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -102,12 +102,6 @@ prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) BUG(); /* how!?, what priority? */ } -static unsigned int -get_rr_interval_stop(struct rq *rq, struct task_struct *task) -{ - return 0; -} - static void update_curr_stop(struct rq *rq) { } @@ -136,8 +130,6 @@ const struct sched_class stop_sched_class = { .task_tick = task_tick_stop, - .get_rr_interval = get_rr_interval_stop, - .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ba81187bb7af..9079d865a935 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1328,7 +1328,7 @@ sd_init(struct sched_domain_topology_level *tl, sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, "wrong sd_flags in topology description\n")) - sd_flags &= ~TOPOLOGY_SD_FLAGS; + sd_flags &= TOPOLOGY_SD_FLAGS; /* Apply detected topology flags */ sd_flags |= dflags; diff --git a/kernel/smp.c b/kernel/smp.c index 472c2b274c65..aa17eedff5be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,24 +669,6 @@ void __init smp_init(void) { int num_nodes, num_cpus; - /* - * Ensure struct irq_work layout matches so that - * flush_smp_call_function_queue() can do horrible things. - */ - BUILD_BUG_ON(offsetof(struct irq_work, llnode) != - offsetof(struct __call_single_data, llist)); - BUILD_BUG_ON(offsetof(struct irq_work, func) != - offsetof(struct __call_single_data, func)); - BUILD_BUG_ON(offsetof(struct irq_work, flags) != - offsetof(struct __call_single_data, flags)); - - /* - * Assert the CSD_TYPE_TTWU layout is similar enough - * for task_struct to be on the @call_single_queue. 
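dl_task_fits_capacity(), added to kernel/sched/sched.h above, boils down to runtime/deadline <= capacity/1024. A quick userspace check with invented numbers: a 6 ms runtime per 10 ms deadline needs about 61% of a full-capacity CPU, so it fits a 1024-capacity big core but not a 512-capacity LITTLE.

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

static int fits(uint64_t runtime, uint64_t deadline, unsigned long cap)
{
	/* same test as the kernel's cap_scale(deadline, cap) >= runtime */
	return ((deadline * cap) >> SCHED_CAPACITY_SHIFT) >= runtime;
}

int main(void)
{
	uint64_t runtime = 6000000, deadline = 10000000;  /* 6 ms / 10 ms */

	printf("big(1024): %d  LITTLE(512): %d\n",
	       fits(runtime, deadline, 1024),
	       fits(runtime, deadline, 512));
	return 0;
}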
- */ - BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != - offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); - idle_threads_init(); cpuhp_threads_init(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index db1ce7af2563..4aea67d3d552 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1780,6 +1780,20 @@ static struct ctl_table kern_table[] = { .proc_handler = sched_rt_handler, }, { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, .maxlen = sizeof(int), diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1903b80db6eb..72064541bef2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2764,6 +2764,50 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } +/* List of trace_ops that have allocated trampolines */ +static LIST_HEAD(ftrace_ops_trampoline_list); + +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_add_rcu(&ops->list, &ftrace_ops_trampoline_list); +} + +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops) +{ + lockdep_assert_held(&ftrace_lock); + list_del_rcu(&ops->list); +} + +/* + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is + * not a module. + */ +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace" +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline" + +static void ftrace_trampoline_free(struct ftrace_ops *ops) +{ + if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) && + ops->trampoline) { + /* + * Record the text poke event before the ksymbol unregister + * event. 
+ */ + perf_event_text_poke((void *)ops->trampoline, + (void *)ops->trampoline, + ops->trampoline_size, NULL, 0); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, + true, FTRACE_TRAMPOLINE_SYM); + /* Remove from kallsyms after the perf events */ + ftrace_remove_trampoline_from_kallsyms(ops); + } + + arch_ftrace_trampoline_free(ops); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { @@ -2934,7 +2978,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) synchronize_rcu_tasks(); free_ops: - arch_ftrace_trampoline_free(ops); + ftrace_trampoline_free(ops); } return 0; @@ -6178,6 +6222,27 @@ struct ftrace_mod_map { unsigned int num_funcs; }; +static int ftrace_get_trampoline_kallsym(unsigned int symnum, + unsigned long *value, char *type, + char *name, char *module_name, + int *exported) +{ + struct ftrace_ops *op; + + list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) { + if (!op->trampoline || symnum--) + continue; + *value = op->trampoline; + *type = 't'; + strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN); + strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN); + *exported = 0; + return 0; + } + + return -ERANGE; +} + #ifdef CONFIG_MODULES #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) @@ -6514,6 +6579,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, { struct ftrace_mod_map *mod_map; struct ftrace_mod_func *mod_func; + int ret; preempt_disable(); list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) { @@ -6540,8 +6606,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, WARN_ON(1); break; } + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); preempt_enable(); - return -ERANGE; + return ret; } #else @@ -6553,6 +6621,18 @@ allocate_ftrace_mod_map(struct module *mod, { return NULL; } +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, char *module_name, + int *exported) +{ + int ret; + + preempt_disable(); + ret = ftrace_get_trampoline_kallsym(symnum, value, type, name, + module_name, exported); + preempt_enable(); + return ret; +} #endif /* CONFIG_MODULES */ struct ftrace_init_func { @@ -6733,7 +6813,24 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { + unsigned long trampoline = ops->trampoline; + arch_ftrace_update_trampoline(ops); + if (ops->trampoline && ops->trampoline != trampoline && + (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) { + /* Add to kallsyms before the perf events */ + ftrace_add_trampoline_to_kallsyms(ops); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, + ops->trampoline, ops->trampoline_size, false, + FTRACE_TRAMPOLINE_SYM); + /* + * Record the perf text poke event after the ksymbol register + * event. 
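The ftrace hunks above register dynamically allocated trampolines with kallsyms under the pseudo-module name "__builtin__ftrace" and emit matching perf ksymbol/text-poke events, so profilers can attribute samples that land in a trampoline. A small userspace sketch, assuming the symbols become visible through /proc/kallsyms on such a kernel; it prints nothing unless some ftrace user currently owns an allocated trampoline, and addresses may read as zero without sufficient privileges.

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/kallsyms", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "[__builtin__ftrace]"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}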
+ */ + perf_event_text_poke((void *)ops->trampoline, NULL, 0, + (void *)ops->trampoline, + ops->trampoline_size); + } } void ftrace_init_trace_array(struct trace_array *tr) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 81f5464ea9e1..34b84bcbd3d9 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -15,11 +15,15 @@ config CC_HAS_KASAN_GENERIC config CC_HAS_KASAN_SW_TAGS def_bool $(cc-option, -fsanitize=kernel-hwaddress) +config CC_HAS_WORKING_NOSANITIZE_ADDRESS + def_bool !CC_IS_GCC || GCC_VERSION >= 80300 + config KASAN bool "KASAN: runtime memory debugger" depends on (HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC) || \ (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS) depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) + depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS help Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/math/div64.c b/lib/math/div64.c index 368ca7fd0d82..3952a07130d8 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -190,3 +190,44 @@ u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) return __iter_div_u64_rem(dividend, divisor, remainder); } EXPORT_SYMBOL(iter_div_u64_rem); + +#ifndef mul_u64_u64_div_u64 +u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) +{ + u64 res = 0, div, rem; + int shift; + + /* can a * b overflow ? */ + if (ilog2(a) + ilog2(b) > 62) { + /* + * (b * a) / c is equal to + * + * (b / c) * a + + * (b % c) * a / c + * + * if nothing overflows. Can the 1st multiplication + * overflow? Yes, but we do not care: this can only + * happen if the end result can't fit in u64 anyway. + * + * So the code below does + * + * res = (b / c) * a; + * b = b % c; + */ + div = div64_u64_rem(b, c, &rem); + res = div * a; + b = rem; + + shift = ilog2(a) + ilog2(b) - 62; + if (shift > 0) { + /* drop precision */ + b >>= shift; + c >>= shift; + if (!c) + return res; + } + } + + return res + div64_u64(a * b, c); +} +#endif diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index eda15a5a285e..2e2ce089b0e9 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -82,6 +82,8 @@ bool arch_callee_saved_reg(unsigned char reg); unsigned long arch_jump_destination(struct instruction *insn); -unsigned long arch_dest_rela_offset(int addend); +unsigned long arch_dest_reloc_offset(int addend); + +const char *arch_nop_insn(int len); #endif /* _ARCH_H */ diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 4b504fc90bbb..1967370440b3 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -67,7 +67,7 @@ bool arch_callee_saved_reg(unsigned char reg) } } -unsigned long arch_dest_rela_offset(int addend) +unsigned long arch_dest_reloc_offset(int addend) { return addend + 4; } @@ -565,3 +565,21 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) state->regs[16].base = CFI_CFA; state->regs[16].offset = -8; } + +const char *arch_nop_insn(int len) +{ + static const char nops[5][5] = { + /* 1 */ { 0x90 }, + /* 2 */ { 0x66, 0x90 }, + /* 3 */ { 0x0f, 0x1f, 0x00 }, + /* 4 */ { 0x0f, 0x1f, 0x40, 0x00 }, + /* 5 */ { 0x0f, 0x1f, 0x44, 0x00, 0x00 }, + }; + + if (len < 1 || len > 5) { + WARN("invalid NOP size: %d\n", len); + return NULL; + } + + return nops[len-1]; +} diff --git a/tools/objtool/arch/x86/include/arch_elf.h b/tools/objtool/arch/x86/include/arch_elf.h new file mode 100644 index 000000000000..69cc4264b28a --- /dev/null +++ b/tools/objtool/arch/x86/include/arch_elf.h @@ -0,0 +1,6 @@ 
+#ifndef _OBJTOOL_ARCH_ELF +#define _OBJTOOL_ARCH_ELF + +#define R_NONE R_X86_64_NONE + +#endif /* _OBJTOOL_ARCH_ELF */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5fbb90a80d23..d8f898cdab5a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -12,6 +12,7 @@ #include "check.h" #include "special.h" #include "warn.h" +#include "arch_elf.h" #include <linux/hashtable.h> #include <linux/kernel.h> @@ -352,7 +353,7 @@ static struct instruction *find_last_insn(struct objtool_file *file, static int add_dead_ends(struct objtool_file *file) { struct section *sec; - struct rela *rela; + struct reloc *reloc; struct instruction *insn; /* @@ -370,24 +371,24 @@ static int add_dead_ends(struct objtool_file *file) if (!sec) goto reachable; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (rela->addend == rela->sym->sec->len) { - insn = find_last_insn(file, rela->sym->sec); + else if (reloc->addend == reloc->sym->sec->len) { + insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find unreachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } } else { WARN("can't find unreachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } @@ -405,24 +406,24 @@ reachable: if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (insn) insn = list_prev_entry(insn, list); - else if (rela->addend == rela->sym->sec->len) { - insn = find_last_insn(file, rela->sym->sec); + else if (reloc->addend == reloc->sym->sec->len) { + insn = find_last_insn(file, reloc->sym->sec); if (!insn) { WARN("can't find reachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } } else { WARN("can't find reachable insn at %s+0x%x", - rela->sym->sec->name, rela->addend); + reloc->sym->sec->name, reloc->addend); return -1; } @@ -440,26 +441,26 @@ static void add_ignores(struct objtool_file *file) struct instruction *insn; struct section *sec; struct symbol *func; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.func_stack_frame_non_standard"); if (!sec) return; - list_for_each_entry(rela, &sec->rela_list, list) { - switch (rela->sym->type) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + switch (reloc->sym->type) { case STT_FUNC: - func = rela->sym; + func = reloc->sym; break; case STT_SECTION: - func = find_func_by_offset(rela->sym->sec, rela->addend); + func = find_func_by_offset(reloc->sym->sec, reloc->addend); if (!func) continue; break; default: - WARN("unexpected relocation symbol type in %s: %d", sec->name, rela->sym->type); + WARN("unexpected relocation symbol type in %s: %d", sec->name, reloc->sym->type); continue; } @@ -579,20 +580,20 @@ 
static void add_uaccess_safe(struct objtool_file *file) static int add_ignore_alternatives(struct objtool_file *file) { struct section *sec; - struct rela *rela; + struct reloc *reloc; struct instruction *insn; sec = find_section_by_name(file->elf, ".rela.discard.ignore_alts"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.ignore_alts entry"); return -1; @@ -610,7 +611,7 @@ static int add_ignore_alternatives(struct objtool_file *file) static int add_jump_destinations(struct objtool_file *file) { struct instruction *insn; - struct rela *rela; + struct reloc *reloc; struct section *dest_sec; unsigned long dest_off; @@ -621,19 +622,19 @@ static int add_jump_destinations(struct objtool_file *file) if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET) continue; - rela = find_rela_by_dest_range(file->elf, insn->sec, + reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!rela) { + if (!reloc) { dest_sec = insn->sec; dest_off = arch_jump_destination(insn); - } else if (rela->sym->type == STT_SECTION) { - dest_sec = rela->sym->sec; - dest_off = arch_dest_rela_offset(rela->addend); - } else if (rela->sym->sec->idx) { - dest_sec = rela->sym->sec; - dest_off = rela->sym->sym.st_value + - arch_dest_rela_offset(rela->addend); - } else if (strstr(rela->sym->name, "_indirect_thunk_")) { + } else if (reloc->sym->type == STT_SECTION) { + dest_sec = reloc->sym->sec; + dest_off = arch_dest_reloc_offset(reloc->addend); + } else if (reloc->sym->sec->idx) { + dest_sec = reloc->sym->sec; + dest_off = reloc->sym->sym.st_value + + arch_dest_reloc_offset(reloc->addend); + } else if (strstr(reloc->sym->name, "_indirect_thunk_")) { /* * Retpoline jumps are really dynamic jumps in * disguise, so convert them accordingly. 
@@ -647,7 +648,7 @@ static int add_jump_destinations(struct objtool_file *file) continue; } else { /* external sibling call */ - insn->call_dest = rela->sym; + insn->call_dest = reloc->sym; continue; } @@ -723,15 +724,15 @@ static int add_call_destinations(struct objtool_file *file) { struct instruction *insn; unsigned long dest_off; - struct rela *rela; + struct reloc *reloc; for_each_insn(file, insn) { if (insn->type != INSN_CALL) continue; - rela = find_rela_by_dest_range(file->elf, insn->sec, + reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!rela) { + if (!reloc) { dest_off = arch_jump_destination(insn); insn->call_dest = find_func_by_offset(insn->sec, dest_off); if (!insn->call_dest) @@ -751,19 +752,37 @@ static int add_call_destinations(struct objtool_file *file) return -1; } - } else if (rela->sym->type == STT_SECTION) { - dest_off = arch_dest_rela_offset(rela->addend); - insn->call_dest = find_func_by_offset(rela->sym->sec, + } else if (reloc->sym->type == STT_SECTION) { + dest_off = arch_dest_reloc_offset(reloc->addend); + insn->call_dest = find_func_by_offset(reloc->sym->sec, dest_off); if (!insn->call_dest) { WARN_FUNC("can't find call dest symbol at %s+0x%lx", insn->sec, insn->offset, - rela->sym->sec->name, + reloc->sym->sec->name, dest_off); return -1; } } else - insn->call_dest = rela->sym; + insn->call_dest = reloc->sym; + + /* + * Many compilers cannot disable KCOV with a function attribute + * so they need a little help, NOP out any KCOV calls from noinstr + * text. + */ + if (insn->sec->noinstr && + !strncmp(insn->call_dest->name, "__sanitizer_cov_", 16)) { + if (reloc) { + reloc->type = R_NONE; + elf_write_reloc(file->elf, reloc); + } + + elf_write_insn(file->elf, insn->sec, + insn->offset, insn->len, + arch_nop_insn(insn->len)); + insn->type = INSN_NOP; + } /* * Whatever stack impact regular CALLs have, should be undone @@ -871,7 +890,7 @@ static int handle_group_alt(struct objtool_file *file, */ if ((insn->offset != special_alt->new_off || (insn->type != INSN_CALL && !is_static_jump(insn))) && - find_rela_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) { + find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) { WARN_FUNC("unsupported relocation in alternatives section", insn->sec, insn->offset); @@ -1017,34 +1036,34 @@ out: } static int add_jump_table(struct objtool_file *file, struct instruction *insn, - struct rela *table) + struct reloc *table) { - struct rela *rela = table; + struct reloc *reloc = table; struct instruction *dest_insn; struct alternative *alt; struct symbol *pfunc = insn->func->pfunc; unsigned int prev_offset = 0; /* - * Each @rela is a switch table relocation which points to the target + * Each @reloc is a switch table relocation which points to the target * instruction. 
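The add_call_destinations() hunk above teaches objtool to strip KCOV instrumentation out of noinstr text: the call's relocation is rewritten to R_NONE and the call bytes are overwritten with arch_nop_insn() of the same length. Conceptually, for a 5-byte call on x86 (illustration only, with an invented function name; the NOP encoding is the one in the arch_nop_insn() table added earlier in this diff):

/* With CONFIG_KCOV the compiler instruments every function it builds,
 * including ones placed in .noinstr.text: */
noinstr void demo_enter_from_user(void)
{
	/*
	 * compiler emits:    e8 xx xx xx xx    call __sanitizer_cov_trace_pc
	 * objtool rewrites:  0f 1f 44 00 00    5-byte NOP, relocation -> R_NONE
	 */
}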
*/ - list_for_each_entry_from(rela, &table->sec->rela_list, list) { + list_for_each_entry_from(reloc, &table->sec->reloc_list, list) { /* Check for the end of the table: */ - if (rela != table && rela->jump_table_start) + if (reloc != table && reloc->jump_table_start) break; /* Make sure the table entries are consecutive: */ - if (prev_offset && rela->offset != prev_offset + 8) + if (prev_offset && reloc->offset != prev_offset + 8) break; /* Detect function pointers from contiguous objects: */ - if (rela->sym->sec == pfunc->sec && - rela->addend == pfunc->offset) + if (reloc->sym->sec == pfunc->sec && + reloc->addend == pfunc->offset) break; - dest_insn = find_insn(file, rela->sym->sec, rela->addend); + dest_insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!dest_insn) break; @@ -1060,7 +1079,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, alt->insn = dest_insn; list_add_tail(&alt->list, &insn->alts); - prev_offset = rela->offset; + prev_offset = reloc->offset; } if (!prev_offset) { @@ -1115,11 +1134,11 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * * NOTE: RETPOLINE made it harder still to decode dynamic jumps. */ -static struct rela *find_jump_table(struct objtool_file *file, +static struct reloc *find_jump_table(struct objtool_file *file, struct symbol *func, struct instruction *insn) { - struct rela *text_rela, *table_rela; + struct reloc *text_reloc, *table_reloc; struct instruction *dest_insn, *orig_insn = insn; struct section *table_sec; unsigned long table_offset; @@ -1144,16 +1163,16 @@ static struct rela *find_jump_table(struct objtool_file *file, break; /* look for a relocation which references .rodata */ - text_rela = find_rela_by_dest_range(file->elf, insn->sec, + text_reloc = find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len); - if (!text_rela || text_rela->sym->type != STT_SECTION || - !text_rela->sym->sec->rodata) + if (!text_reloc || text_reloc->sym->type != STT_SECTION || + !text_reloc->sym->sec->rodata) continue; - table_offset = text_rela->addend; - table_sec = text_rela->sym->sec; + table_offset = text_reloc->addend; + table_sec = text_reloc->sym->sec; - if (text_rela->type == R_X86_64_PC32) + if (text_reloc->type == R_X86_64_PC32) table_offset += 4; /* @@ -1170,14 +1189,14 @@ static struct rela *find_jump_table(struct objtool_file *file, continue; /* - * Each table entry has a rela associated with it. The rela + * Each table entry has a reloc associated with it. The reloc * should reference text in the same function as the original * instruction. */ - table_rela = find_rela_by_dest(file->elf, table_sec, table_offset); - if (!table_rela) + table_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset); + if (!table_reloc) continue; - dest_insn = find_insn(file, table_rela->sym->sec, table_rela->addend); + dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend); if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func) continue; @@ -1186,10 +1205,10 @@ static struct rela *find_jump_table(struct objtool_file *file, * indicates a rare GCC quirk/bug which can leave dead code * behind. 
*/ - if (text_rela->type == R_X86_64_PC32) + if (text_reloc->type == R_X86_64_PC32) file->ignore_unreachables = true; - return table_rela; + return table_reloc; } return NULL; @@ -1203,7 +1222,7 @@ static void mark_func_jump_tables(struct objtool_file *file, struct symbol *func) { struct instruction *insn, *last = NULL; - struct rela *rela; + struct reloc *reloc; func_for_each_insn(file, func, insn) { if (!last) @@ -1226,10 +1245,10 @@ static void mark_func_jump_tables(struct objtool_file *file, if (insn->type != INSN_JUMP_DYNAMIC) continue; - rela = find_jump_table(file, func, insn); - if (rela) { - rela->jump_table_start = true; - insn->jump_table = rela; + reloc = find_jump_table(file, func, insn); + if (reloc) { + reloc->jump_table_start = true; + insn->jump_table = reloc; } } } @@ -1283,8 +1302,8 @@ static int add_jump_table_alts(struct objtool_file *file) static int read_unwind_hints(struct objtool_file *file) { - struct section *sec, *relasec; - struct rela *rela; + struct section *sec, *relocsec; + struct reloc *reloc; struct unwind_hint *hint; struct instruction *insn; struct cfi_reg *cfa; @@ -1294,8 +1313,8 @@ static int read_unwind_hints(struct objtool_file *file) if (!sec) return 0; - relasec = sec->rela; - if (!relasec) { + relocsec = sec->reloc; + if (!relocsec) { WARN("missing .rela.discard.unwind_hints section"); return -1; } @@ -1310,13 +1329,13 @@ static int read_unwind_hints(struct objtool_file *file) for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) { hint = (struct unwind_hint *)sec->data->d_buf + i; - rela = find_rela_by_dest(file->elf, sec, i * sizeof(*hint)); - if (!rela) { - WARN("can't find rela for unwind_hints[%d]", i); + reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint)); + if (!reloc) { + WARN("can't find reloc for unwind_hints[%d]", i); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("can't find insn for unwind_hints[%d]", i); return -1; @@ -1374,19 +1393,19 @@ static int read_retpoline_hints(struct objtool_file *file) { struct section *sec; struct instruction *insn; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.retpoline_safe entry"); return -1; @@ -1409,19 +1428,19 @@ static int read_instr_hints(struct objtool_file *file) { struct section *sec; struct instruction *insn; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.instr_end"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.instr_end entry"); return -1; @@ -1434,13 +1453,13 @@ static int read_instr_hints(struct objtool_file *file) if (!sec) return 0; - list_for_each_entry(rela, 
&sec->rela_list, list) { - if (rela->sym->type != STT_SECTION) { + list_for_each_entry(reloc, &sec->reloc_list, list) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.instr_begin entry"); return -1; @@ -1456,22 +1475,22 @@ static int read_intra_function_calls(struct objtool_file *file) { struct instruction *insn; struct section *sec; - struct rela *rela; + struct reloc *reloc; sec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls"); if (!sec) return 0; - list_for_each_entry(rela, &sec->rela_list, list) { + list_for_each_entry(reloc, &sec->reloc_list, list) { unsigned long dest_off; - if (rela->sym->type != STT_SECTION) { + if (reloc->sym->type != STT_SECTION) { WARN("unexpected relocation symbol type in %s", sec->name); return -1; } - insn = find_insn(file, rela->sym->sec, rela->addend); + insn = find_insn(file, reloc->sym->sec, reloc->addend); if (!insn) { WARN("bad .discard.intra_function_call entry"); return -1; @@ -2190,10 +2209,36 @@ static inline const char *call_dest_name(struct instruction *insn) return "{dynamic}"; } +static inline bool noinstr_call_dest(struct symbol *func) +{ + /* + * We can't deal with indirect function calls at present; + * assume they're instrumented. + */ + if (!func) + return false; + + /* + * If the symbol is from a noinstr section; we good. + */ + if (func->sec->noinstr) + return true; + + /* + * The __ubsan_handle_*() calls are like WARN(), they only happen when + * something 'BAD' happened. At the risk of taking the machine down, + * let them proceed to get the message out. + */ + if (!strncmp(func->name, "__ubsan_handle_", 15)) + return true; + + return false; +} + static int validate_call(struct instruction *insn, struct insn_state *state) { if (state->noinstr && state->instr <= 0 && - (!insn->call_dest || !insn->call_dest->sec->noinstr)) { + !noinstr_call_dest(insn->call_dest)) { WARN_FUNC("call to %s() leaves .noinstr.text section", insn->sec, insn->offset, call_dest_name(insn)); return 1; @@ -2740,7 +2785,7 @@ int check(const char *_objname, bool orc) objname = _objname; - file.elf = elf_open_read(objname, orc ? 
O_RDWR : O_RDONLY); + file.elf = elf_open_read(objname, O_RDWR); if (!file.elf) return 1; @@ -2801,7 +2846,9 @@ int check(const char *_objname, bool orc) ret = create_orc_sections(&file); if (ret < 0) goto out; + } + if (file.elf->changed) { ret = elf_write(file.elf); if (ret < 0) goto out; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 906b5210f7ca..061aa96e15d3 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -37,7 +37,7 @@ struct instruction { struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src; - struct rela *jump_table; + struct reloc *jump_table; struct list_head alts; struct symbol *func; struct list_head stack_ops; diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 84225679f96d..3ddbd66f1a37 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -228,26 +228,26 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name) return NULL; } -struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, +struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len) { - struct rela *rela, *r = NULL; + struct reloc *reloc, *r = NULL; unsigned long o; - if (!sec->rela) + if (!sec->reloc) return NULL; - sec = sec->rela; + sec = sec->reloc; for_offset_range(o, offset, offset + len) { - elf_hash_for_each_possible(elf->rela_hash, rela, hash, + elf_hash_for_each_possible(elf->reloc_hash, reloc, hash, sec_offset_hash(sec, o)) { - if (rela->sec != sec) + if (reloc->sec != sec) continue; - if (rela->offset >= offset && rela->offset < offset + len) { - if (!r || rela->offset < r->offset) - r = rela; + if (reloc->offset >= offset && reloc->offset < offset + len) { + if (!r || reloc->offset < r->offset) + r = reloc; } } if (r) @@ -257,9 +257,9 @@ struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, return NULL; } -struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset) +struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset) { - return find_rela_by_dest_range(elf, sec, offset, 1); + return find_reloc_by_dest_range(elf, sec, offset, 1); } static int read_sections(struct elf *elf) @@ -288,7 +288,7 @@ static int read_sections(struct elf *elf) memset(sec, 0, sizeof(*sec)); INIT_LIST_HEAD(&sec->symbol_list); - INIT_LIST_HEAD(&sec->rela_list); + INIT_LIST_HEAD(&sec->reloc_list); s = elf_getscn(elf->elf, i); if (!s) { @@ -434,7 +434,13 @@ static int read_symbols(struct elf *elf) size_t pnamelen; if (sym->type != STT_FUNC) continue; - sym->pfunc = sym->cfunc = sym; + + if (sym->pfunc == NULL) + sym->pfunc = sym; + + if (sym->cfunc == NULL) + sym->cfunc = sym; + coldstr = strstr(sym->name, ".cold"); if (!coldstr) continue; @@ -482,71 +488,101 @@ err: return -1; } -void elf_add_rela(struct elf *elf, struct rela *rela) +void elf_add_reloc(struct elf *elf, struct reloc *reloc) +{ + struct section *sec = reloc->sec; + + list_add_tail(&reloc->list, &sec->reloc_list); + elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc)); +} + +static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) { - struct section *sec = rela->sec; + if (!gelf_getrel(sec->data, i, &reloc->rel)) { + WARN_ELF("gelf_getrel"); + return -1; + } + reloc->type = GELF_R_TYPE(reloc->rel.r_info); + reloc->addend = 0; + reloc->offset = reloc->rel.r_offset; + *symndx = GELF_R_SYM(reloc->rel.r_info); + return 
0; +} - list_add_tail(&rela->list, &sec->rela_list); - elf_hash_add(elf->rela_hash, &rela->hash, rela_hash(rela)); +static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx) +{ + if (!gelf_getrela(sec->data, i, &reloc->rela)) { + WARN_ELF("gelf_getrela"); + return -1; + } + reloc->type = GELF_R_TYPE(reloc->rela.r_info); + reloc->addend = reloc->rela.r_addend; + reloc->offset = reloc->rela.r_offset; + *symndx = GELF_R_SYM(reloc->rela.r_info); + return 0; } -static int read_relas(struct elf *elf) +static int read_relocs(struct elf *elf) { struct section *sec; - struct rela *rela; + struct reloc *reloc; int i; unsigned int symndx; - unsigned long nr_rela, max_rela = 0, tot_rela = 0; + unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0; list_for_each_entry(sec, &elf->sections, list) { - if (sec->sh.sh_type != SHT_RELA) + if ((sec->sh.sh_type != SHT_RELA) && + (sec->sh.sh_type != SHT_REL)) continue; - sec->base = find_section_by_name(elf, sec->name + 5); + sec->base = find_section_by_index(elf, sec->sh.sh_info); if (!sec->base) { - WARN("can't find base section for rela section %s", + WARN("can't find base section for reloc section %s", sec->name); return -1; } - sec->base->rela = sec; + sec->base->reloc = sec; - nr_rela = 0; + nr_reloc = 0; for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) { - rela = malloc(sizeof(*rela)); - if (!rela) { + reloc = malloc(sizeof(*reloc)); + if (!reloc) { perror("malloc"); return -1; } - memset(rela, 0, sizeof(*rela)); - - if (!gelf_getrela(sec->data, i, &rela->rela)) { - WARN_ELF("gelf_getrela"); - return -1; + memset(reloc, 0, sizeof(*reloc)); + switch (sec->sh.sh_type) { + case SHT_REL: + if (read_rel_reloc(sec, i, reloc, &symndx)) + return -1; + break; + case SHT_RELA: + if (read_rela_reloc(sec, i, reloc, &symndx)) + return -1; + break; + default: return -1; } - rela->type = GELF_R_TYPE(rela->rela.r_info); - rela->addend = rela->rela.r_addend; - rela->offset = rela->rela.r_offset; - symndx = GELF_R_SYM(rela->rela.r_info); - rela->sym = find_symbol_by_index(elf, symndx); - rela->sec = sec; - if (!rela->sym) { - WARN("can't find rela entry symbol %d for %s", + reloc->sec = sec; + reloc->idx = i; + reloc->sym = find_symbol_by_index(elf, symndx); + if (!reloc->sym) { + WARN("can't find reloc entry symbol %d for %s", symndx, sec->name); return -1; } - elf_add_rela(elf, rela); - nr_rela++; + elf_add_reloc(elf, reloc); + nr_reloc++; } - max_rela = max(max_rela, nr_rela); - tot_rela += nr_rela; + max_reloc = max(max_reloc, nr_reloc); + tot_reloc += nr_reloc; } if (stats) { - printf("max_rela: %lu\n", max_rela); - printf("tot_rela: %lu\n", tot_rela); + printf("max_reloc: %lu\n", max_reloc); + printf("tot_reloc: %lu\n", tot_reloc); } return 0; @@ -572,7 +608,7 @@ struct elf *elf_open_read(const char *name, int flags) elf_hash_init(elf->symbol_name_hash); elf_hash_init(elf->section_hash); elf_hash_init(elf->section_name_hash); - elf_hash_init(elf->rela_hash); + elf_hash_init(elf->reloc_hash); elf->fd = open(name, flags); if (elf->fd == -1) { @@ -605,7 +641,7 @@ struct elf *elf_open_read(const char *name, int flags) if (read_symbols(elf)) goto err; - if (read_relas(elf)) + if (read_relocs(elf)) goto err; return elf; @@ -631,7 +667,7 @@ struct section *elf_create_section(struct elf *elf, const char *name, memset(sec, 0, sizeof(*sec)); INIT_LIST_HEAD(&sec->symbol_list); - INIT_LIST_HEAD(&sec->rela_list); + INIT_LIST_HEAD(&sec->reloc_list); s = elf_newscn(elf->elf); if (!s) { @@ -713,28 +749,60 @@ struct section 
*elf_create_section(struct elf *elf, const char *name, elf_hash_add(elf->section_hash, &sec->hash, sec->idx); elf_hash_add(elf->section_name_hash, &sec->name_hash, str_hash(sec->name)); + elf->changed = true; + return sec; } -struct section *elf_create_rela_section(struct elf *elf, struct section *base) +static struct section *elf_create_rel_reloc_section(struct elf *elf, struct section *base) { - char *relaname; + char *relocname; struct section *sec; - relaname = malloc(strlen(base->name) + strlen(".rela") + 1); - if (!relaname) { + relocname = malloc(strlen(base->name) + strlen(".rel") + 1); + if (!relocname) { perror("malloc"); return NULL; } - strcpy(relaname, ".rela"); - strcat(relaname, base->name); + strcpy(relocname, ".rel"); + strcat(relocname, base->name); - sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0); - free(relaname); + sec = elf_create_section(elf, relocname, sizeof(GElf_Rel), 0); + free(relocname); if (!sec) return NULL; - base->rela = sec; + base->reloc = sec; + sec->base = base; + + sec->sh.sh_type = SHT_REL; + sec->sh.sh_addralign = 8; + sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx; + sec->sh.sh_info = base->idx; + sec->sh.sh_flags = SHF_INFO_LINK; + + return sec; +} + +static struct section *elf_create_rela_reloc_section(struct elf *elf, struct section *base) +{ + char *relocname; + struct section *sec; + + relocname = malloc(strlen(base->name) + strlen(".rela") + 1); + if (!relocname) { + perror("malloc"); + return NULL; + } + strcpy(relocname, ".rela"); + strcat(relocname, base->name); + + sec = elf_create_section(elf, relocname, sizeof(GElf_Rela), 0); + free(relocname); + if (!sec) + return NULL; + + base->reloc = sec; sec->base = base; sec->sh.sh_type = SHT_RELA; @@ -746,40 +814,143 @@ struct section *elf_create_rela_section(struct elf *elf, struct section *base) return sec; } -int elf_rebuild_rela_section(struct section *sec) +struct section *elf_create_reloc_section(struct elf *elf, + struct section *base, + int reltype) { - struct rela *rela; - int nr, idx = 0, size; - GElf_Rela *relas; + switch (reltype) { + case SHT_REL: return elf_create_rel_reloc_section(elf, base); + case SHT_RELA: return elf_create_rela_reloc_section(elf, base); + default: return NULL; + } +} - nr = 0; - list_for_each_entry(rela, &sec->rela_list, list) - nr++; +static int elf_rebuild_rel_reloc_section(struct section *sec, int nr) +{ + struct reloc *reloc; + int idx = 0, size; + GElf_Rel *relocs; + + /* Allocate a buffer for relocations */ + size = nr * sizeof(*relocs); + relocs = malloc(size); + if (!relocs) { + perror("malloc"); + return -1; + } + + sec->data->d_buf = relocs; + sec->data->d_size = size; + + sec->sh.sh_size = size; + + idx = 0; + list_for_each_entry(reloc, &sec->reloc_list, list) { + relocs[idx].r_offset = reloc->offset; + relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + idx++; + } + + return 0; +} - size = nr * sizeof(*relas); - relas = malloc(size); - if (!relas) { +static int elf_rebuild_rela_reloc_section(struct section *sec, int nr) +{ + struct reloc *reloc; + int idx = 0, size; + GElf_Rela *relocs; + + /* Allocate a buffer for relocations with addends */ + size = nr * sizeof(*relocs); + relocs = malloc(size); + if (!relocs) { perror("malloc"); return -1; } - sec->data->d_buf = relas; + sec->data->d_buf = relocs; sec->data->d_size = size; sec->sh.sh_size = size; idx = 0; - list_for_each_entry(rela, &sec->rela_list, list) { - relas[idx].r_offset = rela->offset; - relas[idx].r_addend = rela->addend; - relas[idx].r_info = 
GELF_R_INFO(rela->sym->idx, rela->type); + list_for_each_entry(reloc, &sec->reloc_list, list) { + relocs[idx].r_offset = reloc->offset; + relocs[idx].r_addend = reloc->addend; + relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); idx++; } return 0; } -int elf_write(const struct elf *elf) +int elf_rebuild_reloc_section(struct elf *elf, struct section *sec) +{ + struct reloc *reloc; + int nr; + + sec->changed = true; + elf->changed = true; + + nr = 0; + list_for_each_entry(reloc, &sec->reloc_list, list) + nr++; + + switch (sec->sh.sh_type) { + case SHT_REL: return elf_rebuild_rel_reloc_section(sec, nr); + case SHT_RELA: return elf_rebuild_rela_reloc_section(sec, nr); + default: return -1; + } +} + +int elf_write_insn(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int len, + const char *insn) +{ + Elf_Data *data = sec->data; + + if (data->d_type != ELF_T_BYTE || data->d_off) { + WARN("write to unexpected data for section: %s", sec->name); + return -1; + } + + memcpy(data->d_buf + offset, insn, len); + elf_flagdata(data, ELF_C_SET, ELF_F_DIRTY); + + elf->changed = true; + + return 0; +} + +int elf_write_reloc(struct elf *elf, struct reloc *reloc) +{ + struct section *sec = reloc->sec; + + if (sec->sh.sh_type == SHT_REL) { + reloc->rel.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rel.r_offset = reloc->offset; + + if (!gelf_update_rel(sec->data, reloc->idx, &reloc->rel)) { + WARN_ELF("gelf_update_rel"); + return -1; + } + } else { + reloc->rela.r_info = GELF_R_INFO(reloc->sym->idx, reloc->type); + reloc->rela.r_addend = reloc->addend; + reloc->rela.r_offset = reloc->offset; + + if (!gelf_update_rela(sec->data, reloc->idx, &reloc->rela)) { + WARN_ELF("gelf_update_rela"); + return -1; + } + } + + elf->changed = true; + + return 0; +} + +int elf_write(struct elf *elf) { struct section *sec; Elf_Scn *s; @@ -796,6 +967,8 @@ int elf_write(const struct elf *elf) WARN_ELF("gelf_update_shdr"); return -1; } + + sec->changed = false; } } @@ -808,6 +981,8 @@ int elf_write(const struct elf *elf) return -1; } + elf->changed = false; + return 0; } @@ -815,7 +990,7 @@ void elf_close(struct elf *elf) { struct section *sec, *tmpsec; struct symbol *sym, *tmpsym; - struct rela *rela, *tmprela; + struct reloc *reloc, *tmpreloc; if (elf->elf) elf_end(elf->elf); @@ -829,10 +1004,10 @@ void elf_close(struct elf *elf) hash_del(&sym->hash); free(sym); } - list_for_each_entry_safe(rela, tmprela, &sec->rela_list, list) { - list_del(&rela->list); - hash_del(&rela->hash); - free(rela); + list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) { + list_del(&reloc->list); + hash_del(&reloc->hash); + free(reloc); } list_del(&sec->list); free(sec); diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h index f4fe1d6ea392..6cc80a075166 100644 --- a/tools/objtool/elf.h +++ b/tools/objtool/elf.h @@ -32,8 +32,8 @@ struct section { GElf_Shdr sh; struct rb_root symbol_tree; struct list_head symbol_list; - struct list_head rela_list; - struct section *base, *rela; + struct list_head reloc_list; + struct section *base, *reloc; struct symbol *sym; Elf_Data *data; char *name; @@ -58,15 +58,19 @@ struct symbol { bool uaccess_safe; }; -struct rela { +struct reloc { struct list_head list; struct hlist_node hash; - GElf_Rela rela; + union { + GElf_Rela rela; + GElf_Rel rel; + }; struct section *sec; struct symbol *sym; - unsigned int type; unsigned long offset; + unsigned int type; int addend; + int idx; bool jump_table_start; }; @@ -76,13 +80,14 @@ struct elf { Elf *elf; GElf_Ehdr 
ehdr; int fd; + bool changed; char *name; struct list_head sections; DECLARE_HASHTABLE(symbol_hash, ELF_HASH_BITS); DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS); DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS); DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS); - DECLARE_HASHTABLE(rela_hash, ELF_HASH_BITS); + DECLARE_HASHTABLE(reloc_hash, ELF_HASH_BITS); }; #define OFFSET_STRIDE_BITS 4 @@ -109,16 +114,20 @@ static inline u32 sec_offset_hash(struct section *sec, unsigned long offset) return ol; } -static inline u32 rela_hash(struct rela *rela) +static inline u32 reloc_hash(struct reloc *reloc) { - return sec_offset_hash(rela->sec, rela->offset); + return sec_offset_hash(reloc->sec, reloc->offset); } struct elf *elf_open_read(const char *name, int flags); struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr); -struct section *elf_create_rela_section(struct elf *elf, struct section *base); -void elf_add_rela(struct elf *elf, struct rela *rela); -int elf_write(const struct elf *elf); +struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype); +void elf_add_reloc(struct elf *elf, struct reloc *reloc); +int elf_write_insn(struct elf *elf, struct section *sec, + unsigned long offset, unsigned int len, + const char *insn); +int elf_write_reloc(struct elf *elf, struct reloc *reloc); +int elf_write(struct elf *elf); void elf_close(struct elf *elf); struct section *find_section_by_name(const struct elf *elf, const char *name); @@ -126,11 +135,11 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset); struct symbol *find_symbol_by_name(const struct elf *elf, const char *name); struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); -struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec, +struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset); +struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec, unsigned long offset, unsigned int len); struct symbol *find_func_containing(struct section *sec, unsigned long offset); -int elf_rebuild_rela_section(struct section *sec); +int elf_rebuild_reloc_section(struct elf *elf, struct section *sec); #define for_each_sec(file, sec) \ list_for_each_entry(sec, &file->elf->sections, list) diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c index c9549988121a..968f55e6dd94 100644 --- a/tools/objtool/orc_gen.c +++ b/tools/objtool/orc_gen.c @@ -80,56 +80,56 @@ int create_orc(struct objtool_file *file) return 0; } -static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relasec, +static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relocsec, unsigned int idx, struct section *insn_sec, unsigned long insn_off, struct orc_entry *o) { struct orc_entry *orc; - struct rela *rela; + struct reloc *reloc; /* populate ORC data */ orc = (struct orc_entry *)u_sec->data->d_buf + idx; memcpy(orc, o, sizeof(*orc)); - /* populate rela for ip */ - rela = malloc(sizeof(*rela)); - if (!rela) { + /* populate reloc for ip */ + reloc = malloc(sizeof(*reloc)); + if (!reloc) { perror("malloc"); return -1; } - memset(rela, 0, sizeof(*rela)); + memset(reloc, 0, sizeof(*reloc)); if (insn_sec->sym) { - rela->sym = 
insn_sec->sym; - rela->addend = insn_off; + reloc->sym = insn_sec->sym; + reloc->addend = insn_off; } else { /* * The Clang assembler doesn't produce section symbols, so we * have to reference the function symbol instead: */ - rela->sym = find_symbol_containing(insn_sec, insn_off); - if (!rela->sym) { + reloc->sym = find_symbol_containing(insn_sec, insn_off); + if (!reloc->sym) { /* * Hack alert. This happens when we need to reference * the NOP pad insn immediately after the function. */ - rela->sym = find_symbol_containing(insn_sec, + reloc->sym = find_symbol_containing(insn_sec, insn_off - 1); } - if (!rela->sym) { + if (!reloc->sym) { WARN("missing symbol for insn at offset 0x%lx\n", insn_off); return -1; } - rela->addend = insn_off - rela->sym->offset; + reloc->addend = insn_off - reloc->sym->offset; } - rela->type = R_X86_64_PC32; - rela->offset = idx * sizeof(int); - rela->sec = ip_relasec; + reloc->type = R_X86_64_PC32; + reloc->offset = idx * sizeof(int); + reloc->sec = ip_relocsec; - elf_add_rela(elf, rela); + elf_add_reloc(elf, reloc); return 0; } @@ -137,7 +137,7 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti int create_orc_sections(struct objtool_file *file) { struct instruction *insn, *prev_insn; - struct section *sec, *u_sec, *ip_relasec; + struct section *sec, *u_sec, *ip_relocsec; unsigned int idx; struct orc_entry empty = { @@ -181,8 +181,8 @@ int create_orc_sections(struct objtool_file *file) if (!sec) return -1; - ip_relasec = elf_create_rela_section(file->elf, sec); - if (!ip_relasec) + ip_relocsec = elf_create_reloc_section(file->elf, sec, SHT_RELA); + if (!ip_relocsec) return -1; /* create .orc_unwind section */ @@ -200,7 +200,7 @@ int create_orc_sections(struct objtool_file *file) if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc, sizeof(struct orc_entry))) { - if (create_orc_entry(file->elf, u_sec, ip_relasec, idx, + if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx, insn->sec, insn->offset, &insn->orc)) return -1; @@ -212,7 +212,7 @@ int create_orc_sections(struct objtool_file *file) /* section terminator */ if (prev_insn) { - if (create_orc_entry(file->elf, u_sec, ip_relasec, idx, + if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx, prev_insn->sec, prev_insn->offset + prev_insn->len, &empty)) @@ -222,7 +222,7 @@ int create_orc_sections(struct objtool_file *file) } } - if (elf_rebuild_rela_section(ip_relasec)) + if (elf_rebuild_reloc_section(file->elf, ip_relocsec)) return -1; return 0; diff --git a/tools/objtool/special.c b/tools/objtool/special.c index e74e0189de22..e893f1e48e44 100644 --- a/tools/objtool/special.c +++ b/tools/objtool/special.c @@ -72,7 +72,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, struct section *sec, int idx, struct special_alt *alt) { - struct rela *orig_rela, *new_rela; + struct reloc *orig_reloc, *new_reloc; unsigned long offset; offset = idx * entry->size; @@ -118,30 +118,30 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry, } } - orig_rela = find_rela_by_dest(elf, sec, offset + entry->orig); - if (!orig_rela) { - WARN_FUNC("can't find orig rela", sec, offset + entry->orig); + orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig); + if (!orig_reloc) { + WARN_FUNC("can't find orig reloc", sec, offset + entry->orig); return -1; } - if (orig_rela->sym->type != STT_SECTION) { - WARN_FUNC("don't know how to handle non-section rela symbol %s", - sec, offset + entry->orig, orig_rela->sym->name); + if (orig_reloc->sym->type != 
STT_SECTION) { - WARN_FUNC("don't know how to handle non-section rela symbol %s", - sec, offset + entry->orig, orig_rela->sym->name); + WARN_FUNC("don't know how to handle non-section reloc symbol %s", + sec, offset + entry->orig, orig_reloc->sym->name); return -1; } - alt->orig_sec = orig_rela->sym->sec; - alt->orig_off = orig_rela->addend; + alt->orig_sec = orig_reloc->sym->sec; + alt->orig_off = orig_reloc->addend; if (!entry->group || alt->new_len) { - new_rela = find_rela_by_dest(elf, sec, offset + entry->new); - if (!new_rela) { - WARN_FUNC("can't find new rela", + new_reloc = find_reloc_by_dest(elf, sec, offset + entry->new); + if (!new_reloc) { + WARN_FUNC("can't find new reloc", sec, offset + entry->new); return -1; } - alt->new_sec = new_rela->sym->sec; - alt->new_off = (unsigned int)new_rela->addend; + alt->new_sec = new_reloc->sym->sec; + alt->new_off = (unsigned int)new_reloc->addend; /* _ASM_EXTABLE_EX hack */ if (alt->new_off >= 0x7ffffff0) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 15a329da59fa..9a4349813a30 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -489,11 +489,27 @@ static void test_ptrace_write_gsbase(void) * selector value is changed or not by the GSBASE write in * a ptracer. */ - if (gs == 0 && base == 0xFF) { - printf("[OK]\tGS was reset as expected\n"); - } else { + if (gs != *shared_scratch) { nerrs++; - printf("[FAIL]\tGS=0x%lx, GSBASE=0x%lx (should be 0, 0xFF)\n", gs, base); + printf("[FAIL]\tGS changed to %lx\n", gs); + + /* + * On older kernels, poking a nonzero value into the + * base would zero the selector. On newer kernels, + * this behavior has changed -- poking the base + * changes only the base and, if FSGSBASE is not + * available, this may have no effect on the selector. + */ + if (gs == 0) + printf("\tNote: this is expected behavior on older kernels.\n"); + } else if (have_fsgsbase && (base != 0xFF)) { + nerrs++; + printf("[FAIL]\tGSBASE changed to %lx\n", base); + } else { + printf("[OK]\tGS remained 0x%hx", *shared_scratch); + if (have_fsgsbase) + printf(" and GSBASE changed to 0xFF"); + printf("\n"); } } diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index bc0ecc2e862e..62fba40866d5 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -72,6 +72,7 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void) if (ax != -EFAULT && ax != -ENOSYS) { printf("[FAIL]\tAX had the wrong value: 0x%lx\n", (unsigned long)ax); + printf("\tIP = 0x%lx\n", (unsigned long)ctx->uc_mcontext.gregs[REG_IP]); n_errs++; } else { printf("[OK]\tSeems okay\n"); @@ -226,5 +227,30 @@ int main() } set_eflags(get_eflags() & ~X86_EFLAGS_TF); +#ifdef __x86_64__ + printf("[RUN]\tSYSENTER with TF, invalid state, and GSBASE < 0\n"); + + if (sigsetjmp(jmpbuf, 1) == 0) { + sigtrap_consecutive_syscalls = 0; + + asm volatile ("wrgsbase %%rax\n\t" + :: "a" (0xffffffffffff0000UL)); + + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "sysenter" + : : : "memory", "flags"); + } + set_eflags(get_eflags() & ~X86_EFLAGS_TF); +#endif + return 0; }
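
Editor's note on the objtool portion of this merge: the reader is reworked so that struct rela becomes struct reloc and both SHT_REL sections (implicit addend of 0) and SHT_RELA sections (explicit r_addend) are parsed, split into read_rel_reloc() and read_rela_reloc(). The standalone program below is only an illustrative sketch of that REL/RELA distinction, not part of the patch: it walks an object file with libelf and decodes both entry kinds, printing offset, relocation type, symbol index and addend. The file name, variable names and output format are invented for the example; build with something like "gcc dumprelocs.c -lelf".

/*
 * Sketch: decode SHT_REL and SHT_RELA entries with libelf, mirroring
 * the read_rel_reloc()/read_rela_reloc() split introduced above.
 * Not part of the kernel patch; names here are illustrative only.
 */
#include <fcntl.h>
#include <gelf.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	Elf *elf;
	Elf_Scn *scn = NULL;
	GElf_Shdr sh;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <object-file>\n", argv[0]);
		return 1;
	}

	if (elf_version(EV_CURRENT) == EV_NONE)
		return 1;

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	elf = elf_begin(fd, ELF_C_READ, NULL);
	if (!elf)
		return 1;

	while ((scn = elf_nextscn(elf, scn))) {
		Elf_Data *data;
		size_t i, nr;

		if (!gelf_getshdr(scn, &sh))
			continue;
		if (sh.sh_type != SHT_REL && sh.sh_type != SHT_RELA)
			continue;

		data = elf_getdata(scn, NULL);
		if (!data || !sh.sh_entsize)
			continue;

		nr = sh.sh_size / sh.sh_entsize;
		for (i = 0; i < nr; i++) {
			unsigned long offset, type, symndx;
			long addend = 0;

			if (sh.sh_type == SHT_REL) {
				GElf_Rel rel;

				if (!gelf_getrel(data, i, &rel))
					break;
				offset = rel.r_offset;
				type   = GELF_R_TYPE(rel.r_info);
				symndx = GELF_R_SYM(rel.r_info);
				/* REL: addend is implicit, stays 0 */
			} else {
				GElf_Rela rela;

				if (!gelf_getrela(data, i, &rela))
					break;
				offset = rela.r_offset;
				type   = GELF_R_TYPE(rela.r_info);
				symndx = GELF_R_SYM(rela.r_info);
				addend = rela.r_addend;
			}

			printf("%s: off=0x%lx type=%lu sym=%lu addend=%ld\n",
			       sh.sh_type == SHT_REL ? "REL " : "RELA",
			       offset, type, symndx, addend);
		}
	}

	elf_end(elf);
	close(fd);
	return 0;
}

A related design point visible in the diff itself: objtool now tracks dirty state (elf->changed, sec->changed), so elf_write() only rewrites the object when a run actually modified sections or relocations, and ORC generation opens the file O_RDWR unconditionally while relying on that flag to skip needless writes.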