From b332828c39326b1dca617f387dd15d12e81cd5f0 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:10 +0530 Subject: hw-breakpoints: prepare the code for Hardware Breakpoint interfaces The generic hardware breakpoint interface provides an abstraction of hardware breakpoints in front of specific arch implementations for both kernel and user side breakpoints. This includes execution breakpoints and read/write breakpoints, also known as "watchpoints". This patch introduces header files containing constants, structure definitions and declaration of functions used by the hardware breakpoint core and x86 specific code. It also introduces an array based storage for the debug-register values in 'struct thread_struct', while modifying all users of debugreg member in the structure. [ Impact: add headers for new hardware breakpoint interface ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 16 ++++++++-------- arch/x86/kernel/ptrace.c | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fb5dfb891f0f..291527cb438a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -106,10 +106,10 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; + tsk->thread.debugreg[0] = 0; + tsk->thread.debugreg[1] = 0; + tsk->thread.debugreg[2] = 0; + tsk->thread.debugreg[3] = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); @@ -194,10 +194,10 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, update_debugctlmsr(next->debugctlmsr); if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); /* no 4 and 5 */ set_debugreg(next->debugreg6, 6); set_debugreg(next->debugreg7, 7); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c13..313be40be55a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -471,10 +471,10 @@ static int genregs_set(struct task_struct *target, static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) { switch (n) { - case 0: return child->thread.debugreg0; - case 1: return child->thread.debugreg1; - case 2: return child->thread.debugreg2; - case 3: return child->thread.debugreg3; + case 0: return child->thread.debugreg[0]; + case 1: return child->thread.debugreg[1]; + case 2: return child->thread.debugreg[2]; + case 3: return child->thread.debugreg[3]; case 6: return child->thread.debugreg6; case 7: return child->thread.debugreg7; } @@ -493,10 +493,10 @@ static int ptrace_set_debugreg(struct task_struct *child, return -EIO; switch (n) { - case 0: child->thread.debugreg0 = data; break; - case 1: child->thread.debugreg1 = data; break; - case 2: child->thread.debugreg2 = data; break; - case 3: child->thread.debugreg3 = data; break; + case 0: child->thread.debugreg[0] = data; break; + case 1: child->thread.debugreg[1] = data; break; + case 2: child->thread.debugreg[2] = data; break; + case 3: child->thread.debugreg[3] = data; break; case 6: if ((data & ~0xffffffffUL) != 0) -- cgit v1.2.1 From 0067f1297241ea567f2b22a455519752d70fcca9 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:43:57 +0530 Subject: hw-breakpoints: x86 architecture implementation of Hardware Breakpoint interfaces This patch introduces the arch-specific implementation of the generic hardware breakpoints in kernel/hw_breakpoint.c inside x86 specific directories. It contains functions which help to validate and serve requests using Hardware Breakpoint registers on x86 processors. [ fweisbec@gmail.com: fix conflict against kmemcheck ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/hw_breakpoint.c | 382 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 383 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/hw_breakpoint.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 77df4d654ff9..cbc781829173 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o pci-nommu.o +obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 000000000000..4867c9f3b5fb --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,382 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Unmasked kernel DR7 value */ +static unsigned long kdr7; + +/* + * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. + * Used to clear and verify the status of bits corresponding to DR0 - DR3 + */ +static const unsigned long dr7_masks[HBP_NUM] = { + 0x000f0003, /* LEN0, R/W0, G0, L0 */ + 0x00f0000c, /* LEN1, R/W1, G1, L1 */ + 0x0f000030, /* LEN2, R/W2, G2, L2 */ + 0xf00000c0 /* LEN3, R/W3, G3, L3 */ +}; + + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | + DR_GLOBAL_SLOWDOWN; + return bp_info; +} + +void arch_update_kernel_hw_breakpoint(void *unused) +{ + struct hw_breakpoint *bp; + int i, cpu = get_cpu(); + unsigned long temp_kdr7 = 0; + + /* Don't allow debug exceptions while we update the registers */ + set_debugreg(0UL, 7); + + for (i = hbp_kernel_pos; i < HBP_NUM; i++) { + per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; + if (bp) { + temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); + set_debugreg(bp->info.address, i); + } + } + + /* No need to set DR6. Update the debug registers with kernel-space + * breakpoint values from kdr7 and user-space requests from the + * current process + */ + kdr7 = temp_kdr7; + set_debugreg(kdr7 | current->thread.debugreg7, 7); + put_cpu_no_resched(); +} + +/* + * Install the thread breakpoints in their debug registers. + */ +void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + + switch (hbp_kernel_pos) { + case 4: + set_debugreg(thread->debugreg[3], 3); + case 3: + set_debugreg(thread->debugreg[2], 2); + case 2: + set_debugreg(thread->debugreg[1], 1); + case 1: + set_debugreg(thread->debugreg[0], 0); + default: + break; + } + + /* No need to set DR6 */ + set_debugreg((kdr7 | thread->debugreg7), 7); +} + +/* + * Install the debug register values for just the kernel, no thread. + */ +void arch_uninstall_thread_hw_breakpoint() +{ + /* Clear the user-space portion of debugreg7 by setting only kdr7 */ + set_debugreg(kdr7, 7); + +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case HW_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case HW_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case HW_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in user space. + */ +int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va <= TASK_SIZE - len); +} + +/* + * Check for virtual address in kernel space. + */ +int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +{ + unsigned int len; + + len = get_hbp_len(hbp_len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +/* + * Store a breakpoint's encoded address, length, and type. + */ +static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +{ + /* + * User-space requests will always have the address field populated + * Symbol names from user-space are rejected + */ + if (tsk && bp->info.name) + return -EINVAL; + /* + * For kernel-addresses, either the address or symbol name can be + * specified. + */ + if (bp->info.name) + bp->info.address = (unsigned long) + kallsyms_lookup_name(bp->info.name); + if (bp->info.address) + return 0; + return -EINVAL; +} + +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, + struct task_struct *tsk) +{ + unsigned int align; + int ret = -EINVAL; + + switch (bp->info.type) { + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + case HW_BREAKPOINT_EXECUTE: + if ((!arch_check_va_in_userspace(bp->info.address, + bp->info.len)) && + bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) + return ret; + break; + case HW_BREAKPOINT_WRITE: + break; + case HW_BREAKPOINT_RW: + break; + default: + return ret; + } + + switch (bp->info.len) { + case HW_BREAKPOINT_LEN_1: + align = 0; + break; + case HW_BREAKPOINT_LEN_2: + align = 1; + break; + case HW_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + if (bp->triggered) + ret = arch_store_info(bp, tsk); + + if (ret < 0) + return ret; + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (bp->info.address & align) + return -EINVAL; + + /* Check that the virtual address is in the proper range */ + if (tsk) { + if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + return -EFAULT; + } else { + if (!arch_check_va_in_kernelspace(bp->info.address, + bp->info.len)) + return -EFAULT; + } + return 0; +} + +void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +{ + struct thread_struct *thread = &(tsk->thread); + struct hw_breakpoint *bp = thread->hbp[pos]; + + thread->debugreg7 &= ~dr7_masks[pos]; + if (bp) { + thread->debugreg[pos] = bp->info.address; + thread->debugreg7 |= encode_dr7(pos, bp->info.len, + bp->info.type); + } else + thread->debugreg[pos] = 0; +} + +void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *thread = &(tsk->thread); + + thread->debugreg7 = 0; + for (i = 0; i < HBP_NUM; i++) + thread->debugreg[i] = 0; +} + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). + * + * NOTIFY_STOP returned for all other cases + * + */ +int __kprobes hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct hw_breakpoint *bp; + /* The DR6 value is stored in args->err */ + unsigned long dr7, dr6 = args->err; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + /* Lazy debug register switching */ + if (!test_tsk_thread_flag(current, TIF_DEBUG)) + arch_uninstall_thread_hw_breakpoint(); + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + /* + * Find the corresponding hw_breakpoint structure and + * invoke its triggered callback. + */ + if (i >= hbp_kernel_pos) + bp = per_cpu(this_hbp_kernel[i], cpu); + else { + bp = current->thread.hbp[i]; + if (bp) + rc = NOTIFY_DONE; + } + /* + * bp can be NULL due to lazy debug register switching + * or due to the delay between updates of hbp_kernel_pos + * and this_hbp_kernel. + */ + if (!bp) + continue; + + (bp->triggered)(bp, args->regs); + } + if (dr6 & (~DR_TRAP_BITS)) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu_no_resched(); + return rc; +} + +/* + * Handle debug exception notifications. + */ +int __kprobes hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} -- cgit v1.2.1 From 08d68323d1f0c34452e614263b212ca556dae47f Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:08 +0530 Subject: hw-breakpoints: modifying generic debug exception to use thread-specific debug registers This patch modifies the breakpoint exception handler code to use the new abstract debug register names. [ fweisbec@gmail.com: fix conflict against kmemcheck ] [ Impact: refactor and cleanup x86 debug exception handler ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 69 +++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff0..de9913247dd0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -529,73 +529,52 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned long condition; + unsigned long dr6; int si_code; - get_debugreg(condition, 6); + get_debugreg(dr6, 6); + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); /* * The processor cleared BTF, so don't mark that we need it set. */ clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + + if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, 1); + return; } -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); - return; -#ifdef CONFIG_X86_32 -debug_vm86: - /* reenable preemption: handle_vm86_trap() might sleep */ - dec_preempt_count(); - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); return; } -- cgit v1.2.1 From 1e3500666f7c5daaadadb8431a2927cdbbdb7dd4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:26 +0530 Subject: hw-breakpoints: use wrapper routines around debug registers in processor related functions This patch enables the use of wrapper routines to access the debug/breakpoint registers on cpu management. The hardcoded debug registers save and restore operations for threads breakpoints are replaced by wrappers. And now that we handle the kernel breakpoints too, we also need to handle them on cpu hotplug operations. [ Impact: adapt new hardware breakpoint api to cpu hotplug ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/smpboot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 58d24ef917d8..2b2652d205c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -326,6 +327,7 @@ notrace static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); wmb(); + load_debug_registers(); cpu_idle(); } @@ -1250,6 +1252,7 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); + hw_breakpoint_disable(); } int native_cpu_disable(void) -- cgit v1.2.1 From 66cb5917295958652ff6ba36d83f98f2379c46b4 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:44:55 +0530 Subject: hw-breakpoints: use the new wrapper routines to access debug registers in process/thread code This patch enables the use of abstract debug registers in process-handling routines, according to the new hardware breakpoint Api. [ Impact: adapt thread breakpoints handling code to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 22 ++++++---------------- arch/x86/kernel/process_32.c | 28 ++++++++++++++++++++++++++++ arch/x86/kernel/process_64.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 291527cb438a..19a686c401b5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -46,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -106,12 +110,8 @@ void flush_thread(void) clear_tsk_thread_flag(tsk, TIF_DEBUG); - tsk->thread.debugreg[0] = 0; - tsk->thread.debugreg[1] = 0; - tsk->thread.debugreg[2] = 0; - tsk->thread.debugreg[3] = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. @@ -193,16 +193,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else if (next->debugctlmsr != prev->debugctlmsr) update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b5e4bfef4472..297ffff2ffc2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,6 +61,8 @@ #include #include #include +#include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -265,7 +267,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, task_user_gs(p) = get_user_gs(regs); + p->thread.io_bitmap_ptr = NULL; tsk = current; + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES, GFP_KERNEL); @@ -285,10 +293,13 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); +out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -427,6 +438,23 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. + */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a1a1de292ec..f7b276d4b3fb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,6 +55,8 @@ #include #include #include +#include +#include asmlinkage extern void ret_from_fork(void); @@ -248,6 +250,8 @@ void release_thread(struct task_struct *dead_task) BUG(); } } + if (unlikely(dead_task->thread.debugreg7)) + flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -303,12 +307,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; + p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); savesegment(fs, p->thread.fsindex); savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + err = -ENOMEM; + if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) + if (copy_thread_hw_breakpoint(me, p, clone_flags)) + goto out; + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -347,6 +357,9 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } + if (err) + flush_thread_hw_breakpoint(p); + return err; } @@ -492,6 +505,24 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); + /* + * There's a problem with moving the arch_install_thread_hw_breakpoint() + * call before current is updated. Suppose a kernel breakpoint is + * triggered in between the two, the hw-breakpoint handler will see that + * the 'current' task does not have TIF_DEBUG flag set and will think it + * is leftover from an old task (lazy switching) and will erase it. Then + * until the next context switch, no user-breakpoints will be installed. + * + * The real problem is that it's impossible to update both current and + * physical debug registers at the same instant, so there will always be + * a window in which they disagree and a breakpoint might get triggered. + * Since we use lazy switching, we are forced to assume that a + * disagreement means that current is correct and the exception is due + * to lazy debug register switching. + */ + if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) + arch_install_thread_hw_breakpoint(next_p); + return prev_p; } -- cgit v1.2.1 From da0cdc14f5f7e0faee6b2393fefed056cdb17146 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:03 +0530 Subject: hw-breakpoints: modify signal handling code to refrain from re-enabling HW Breakpoints This patch disables re-enabling of Hardware Breakpoint registers through the signal handling code. This is now done during from hw_breakpoint_handler(). Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/signal.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e3..f33d2e0ef095 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -800,15 +800,6 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* - * Re-enable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) - set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* -- cgit v1.2.1 From 72f674d203cd230426437cdcf7dd6f681dad8b0d Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:45:48 +0530 Subject: hw-breakpoints: modify Ptrace routines to access breakpoint registers This patch modifies the ptrace code to use the new wrapper routines around the debug/breakpoint registers. [ Impact: adapt x86 ptrace to the new breakpoint Api ] Original-patch-by: Alan Stern Signed-off-by: K.Prasad Signed-off-by: Maneesh Soni Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 231 +++++++++++++++++++++++++++++------------------ 1 file changed, 141 insertions(+), 90 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 313be40be55a..b457f78b7dbf 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -136,11 +137,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ - return TASK_SIZE - 3; -} - #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) @@ -265,15 +261,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long debugreg_addr_limit(struct task_struct *task) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - return IA32_PAGE_OFFSET - 3; -#endif - return TASK_SIZE_MAX - 7; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -464,95 +451,159 @@ static int genregs_set(struct task_struct *target, } /* - * This function is trivial and will be inlined by the compiler. - * Having it separates the implementation details of debug - * registers from the interface details of ptrace. + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. */ -static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, + unsigned *type) { - switch (n) { - case 0: return child->thread.debugreg[0]; - case 1: return child->thread.debugreg[1]; - case 2: return child->thread.debugreg[2]; - case 3: return child->thread.debugreg[3]; - case 6: return child->thread.debugreg6; - case 7: return child->thread.debugreg7; - } - return 0; + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } -static int ptrace_set_debugreg(struct task_struct *child, - int n, unsigned long data) +static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) { + struct thread_struct *thread = &(current->thread); int i; - if (unlikely(n == 4 || n == 5)) - return -EIO; + /* + * Store in the virtual DR6 register the fact that the breakpoint + * was hit so the thread's debugger will see it. + */ + for (i = 0; i < hbp_kernel_pos; i++) + /* + * We will check bp->info.address against the address stored in + * thread's hbp structure and not debugreg[i]. This is to ensure + * that the corresponding bit for 'i' in DR7 register is enabled + */ + if (bp->info.address == thread->hbp[i]->info.address) + break; - if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) - return -EIO; + thread->debugreg6 |= (DR_TRAP0 << i); +} - switch (n) { - case 0: child->thread.debugreg[0] = data; break; - case 1: child->thread.debugreg[1] = data; break; - case 2: child->thread.debugreg[2] = data; break; - case 3: child->thread.debugreg[3] = data; break; +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long old_dr7 = thread->debugreg7; + int i, orig_ret = 0, rc = 0; + int enabled, second_pass = 0; + unsigned len, type; + struct hw_breakpoint *bp; + + data &= ~DR_CONTROL_RESERVED; +restore: + /* + * Loop through all the hardware breakpoints, making the + * appropriate changes to each. + */ + for (i = 0; i < HBP_NUM; i++) { + enabled = decode_dr7(data, i, &len, &type); + bp = thread->hbp[i]; + + if (!enabled) { + if (bp) { + /* Don't unregister the breakpoints right-away, + * unless all register_user_hw_breakpoint() + * requests have succeeded. This prevents + * any window of opportunity for debug + * register grabbing by other users. + */ + if (!second_pass) + continue; + unregister_user_hw_breakpoint(tsk, bp); + kfree(bp); + } + continue; + } + if (!bp) { + rc = -ENOMEM; + bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); + if (bp) { + bp->info.address = thread->debugreg[i]; + bp->triggered = ptrace_triggered; + bp->info.len = len; + bp->info.type = type; + rc = register_user_hw_breakpoint(tsk, bp); + if (rc) + kfree(bp); + } + } else + rc = modify_user_hw_breakpoint(tsk, bp); + if (rc) + break; + } + /* + * Make a second pass to free the remaining unused breakpoints + * or to restore the original breakpoints if an error occurred. + */ + if (!second_pass) { + second_pass = 1; + if (rc < 0) { + orig_ret = rc; + data = old_dr7; + } + goto restore; + } + return ((orig_ret < 0) ? orig_ret : rc); +} - case 6: - if ((data & ~0xffffffffUL) != 0) - return -EIO; - child->thread.debugreg6 = data; - break; +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &(tsk->thread); + unsigned long val = 0; + + if (n < HBP_NUM) + val = thread->debugreg[n]; + else if (n == 6) + val = thread->debugreg6; + else if (n == 7) + val = thread->debugreg7; + return val; +} - case 7: - /* - * Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System Programming) - */ -#ifdef CONFIG_X86_32 -#define DR7_MASK 0x5f54 -#else -#define DR7_MASK 0x5554 -#endif - data &= ~DR_CONTROL_RESERVED; - for (i = 0; i < 4; i++) - if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +{ + struct thread_struct *thread = &(tsk->thread); + int rc = 0; + + /* There are no DR4 or DR5 registers */ + if (n == 4 || n == 5) + return -EIO; + + if (n == 6) { + tsk->thread.debugreg6 = val; + goto ret_path; } + if (n < HBP_NUM) { + if (thread->hbp[n]) { + if (arch_check_va_in_userspace(val, + thread->hbp[n]->info.len) == 0) { + rc = -EIO; + goto ret_path; + } + thread->hbp[n]->info.address = val; + } + thread->debugreg[n] = val; + } + /* All that's left is DR7 */ + if (n == 7) + rc = ptrace_write_dr7(tsk, val); - return 0; +ret_path: + return rc; } /* -- cgit v1.2.1 From 17f557e5b5d43a2af66c969f6560ac7105020672 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:46:03 +0530 Subject: hw-breakpoints: cleanup HW Breakpoint registers before kexec This patch disables Hardware breakpoints before doing a 'kexec' on the machine so that the cpu doesn't keep debug registers values which would be out of sync for the new image. Original-patch-by: Alan Stern Signed-off-by: K.Prasad Reviewed-by: Alan Stern Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/machine_kexec_32.c | 2 ++ arch/x86/kernel/machine_kexec_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index c1c429d00130..c843f8406da2 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -25,6 +25,7 @@ #include #include #include +#include static void set_idt(void *newidt, __u16 limit) { @@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 84c3bf209e98..4a8bb82248ae 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -18,6 +18,7 @@ #include #include #include +#include static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) @@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + hw_breakpoint_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC -- cgit v1.2.1 From 62edab9056a6cf0c9207339c8892c923a5217e45 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 1 Jun 2009 23:47:06 +0530 Subject: hw-breakpoints: reset bits in dr6 after the corresponding exception is handled This patch resets the bit in dr6 after the corresponding exception is handled in code, so that we keep a clean track of the current virtual debug status register. [ Impact: keep track of breakpoints triggering completion ] Signed-off-by: K.Prasad Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/hw_breakpoint.c | 13 +++++++++++-- arch/x86/kernel/kgdb.c | 6 ++++++ arch/x86/kernel/kprobes.c | 9 ++++++++- arch/x86/kernel/traps.c | 4 ++-- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4867c9f3b5fb..69451473dbd2 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -314,8 +314,12 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; - /* The DR6 value is stored in args->err */ - unsigned long dr7, dr6 = args->err; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) @@ -351,6 +355,11 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) if (bp) rc = NOTIFY_DONE; } + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching * or due to the delay between updates of hbp_kernel_pos diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b1f4dffb919e..f820b73c7f28 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -43,6 +43,7 @@ #include #include +#include #include #include @@ -434,6 +435,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args) "resuming...\n"); kgdb_arch_handle_exception(args->trapnr, args->signr, args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; return NOTIFY_STOP; } diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..b5b1848c5336 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -54,6 +54,7 @@ #include #include #include +#include void jprobe_return_end(void); @@ -967,8 +968,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_STOP; break; case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) + if (post_kprobe_handler(args->regs)) { + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; ret = NOTIFY_STOP; + } break; case DIE_GPF: /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de9913247dd0..124a4d5a95b2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -545,8 +545,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; - if (notify_die(DIE_DEBUG, "debug", regs, dr6, error_code, - SIGTRAP) == NOTIFY_STOP) + if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, + SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ -- cgit v1.2.1 From 4555835b707d5c778ee1c9076670bc99b1eeaf61 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 17 Jun 2009 14:44:19 +0530 Subject: x86: hw_breakpoint.c arch_check_va_in_kernelspace and hw_breakpoint_handler should be static arch_check_va_in_kernelspace() and hw_breakpoint_handler() is used only by same file so it should be static. Also fixed non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' Fixed following sparse warnings : arch/x86/kernel/hw_breakpoint.c:124:42: warning: non-ANSI function declaration of function 'arch_uninstall_thread_hw_breakpoint' arch/x86/kernel/hw_breakpoint.c:169:5: warning: symbol 'arch_check_va_in_kernelspace' was not declared. Should it be static? arch/x86/kernel/hw_breakpoint.c:313:15: warning: symbol 'hw_breakpoint_handler' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Alan Stern Cc: "K.Prasad" Cc: Frederic Weisbecker LKML-Reference: <1245230059.2662.4.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 51d959528b1d..9316a9de4de3 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -121,7 +121,7 @@ void arch_install_thread_hw_breakpoint(struct task_struct *tsk) /* * Install the debug register values for just the kernel, no thread. */ -void arch_uninstall_thread_hw_breakpoint() +void arch_uninstall_thread_hw_breakpoint(void) { /* Clear the user-space portion of debugreg7 by setting only kdr7 */ set_debugreg(kdr7, 7); @@ -166,7 +166,7 @@ int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) /* * Check for virtual address in kernel space. */ -int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) { unsigned int len; @@ -310,7 +310,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) * NOTIFY_STOP returned for all other cases * */ -int __kprobes hw_breakpoint_handler(struct die_args *args) +static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; struct hw_breakpoint *bp; -- cgit v1.2.1 From 9d22b536609abf0d64648f99518676ea58245e3b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 1 Jul 2009 19:52:30 +0530 Subject: x86: Mark ptrace_get_debugreg() as static This sparse warning: arch/x86/kernel/ptrace.c:560:15: warning: symbol 'ptrace_get_debugreg' was not declared. Should it be static? triggers because ptrace_get_debugreg() is global but is only used in a single .c file. change ptrace_get_debugreg() to static to fix that - this also addresses the sparse warning. Signed-off-by: Jaswinder Singh Rajput Cc: Steven Rostedt LKML-Reference: <1246458150.6940.19.camel@hpdv5.satnam> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b457f78b7dbf..cabdabce3cb2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -557,7 +557,7 @@ restore: /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ -unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; -- cgit v1.2.1 From 39fe05e58c5e448601ce46e6b03900d5bf31c4b0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Aug 2009 11:16:12 +0800 Subject: x86, hpet: Disable per-cpu hpet timer if ARAT is supported If CPU support always running local APIC timer, per-cpu hpet timer could be disabled, which is useless and wasteful in such case. Let's leave the timers to others. The effect is that we reserve less timers. Signed-off-by: Shaohua Li Cc: venkatesh.pallipadi@intel.com LKML-Reference: <20090812031612.GA10062@sli10-desk.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index dedc2bddf7a5..5969e1078fc2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -584,6 +584,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (boot_cpu_has(X86_FEATURE_ARAT)) + return; id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -872,10 +874,8 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); - hpet_msi_capability_lookup(2); return 1; } - hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -908,9 +908,17 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (boot_cpu_has(X86_FEATURE_ARAT)) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } -- cgit v1.2.1 From 5946fa3d5cdeb846a647a1900026af9f8b08c8b5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 19 Aug 2009 08:44:24 +0100 Subject: x86, hpet: Simplify the HPET code On 64-bits, using unsigned long when unsigned int suffices needlessly creates larger code (due to the need for REX prefixes), and most of the logic in hpet.c really doesn't need 64-bit operations. At once this avoids the need for a couple of type casts. Signed-off-by: Jan Beulich Cc: Shaohua Li Cc: Venkatesh Pallipadi LKML-Reference: <4A8BC9780200007800010832@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 5969e1078fc2..ba575f0f2e34 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -47,12 +47,12 @@ struct hpet_dev { char name[10]; }; -unsigned long hpet_readl(unsigned long a) +inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); } -static inline void hpet_writel(unsigned long d, unsigned long a) +static inline void hpet_writel(unsigned int d, unsigned int a) { writel(d, hpet_virt_address + a); } @@ -167,7 +167,7 @@ do { \ static void hpet_reserve_msi_timers(struct hpet_data *hd); -static void hpet_reserve_platform_timers(unsigned long id) +static void hpet_reserve_platform_timers(unsigned int id) { struct hpet __iomem *hpet = hpet_virt_address; struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; @@ -205,7 +205,7 @@ static void hpet_reserve_platform_timers(unsigned long id) } #else -static void hpet_reserve_platform_timers(unsigned long id) { } +static void hpet_reserve_platform_timers(unsigned int id) { } #endif /* @@ -246,7 +246,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -271,7 +271,7 @@ static void hpet_resume_counter(void) static void hpet_enable_legacy_int(void) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); cfg |= HPET_CFG_LEGACY; hpet_writel(cfg, HPET_CFG); @@ -314,7 +314,7 @@ static int hpet_setup_msi_irq(unsigned int irq); static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { - unsigned long cfg, cmp, now; + unsigned int cfg, cmp, now; uint64_t delta; switch (mode) { @@ -323,7 +323,7 @@ static void hpet_set_mode(enum clock_event_mode mode, delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); - cmp = now + (unsigned long) delta; + cmp = now + (unsigned int) delta; cfg = hpet_readl(HPET_Tn_CFG(timer)); /* Make sure we use edge triggered interrupts */ cfg &= ~HPET_TN_LEVEL; @@ -339,7 +339,7 @@ static void hpet_set_mode(enum clock_event_mode mode, * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); hpet_start_counter(); hpet_print_config(); break; @@ -387,9 +387,9 @@ static int hpet_next_event(unsigned long delta, * what we wrote hit the chip before we compare it to the * counter. */ - WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt); - return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -415,7 +415,7 @@ static struct hpet_dev *hpet_devs; void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); - unsigned long cfg; + unsigned int cfg; /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); @@ -425,7 +425,7 @@ void hpet_msi_unmask(unsigned int irq) void hpet_msi_mask(unsigned int irq) { - unsigned long cfg; + unsigned int cfg; struct hpet_dev *hdev = get_irq_data(irq); /* mask it */ @@ -600,7 +600,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); /* Only consider HPET timer with MSI support */ if (!(cfg & HPET_TN_FSB_CAP)) @@ -815,7 +815,7 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { - unsigned long id; + unsigned int id; int i; if (!is_hpet_capable()) @@ -933,7 +933,7 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { if (is_hpet_capable()) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned int cfg = hpet_readl(HPET_CFG); if (hpet_legacy_int_enabled) { cfg &= ~HPET_CFG_LEGACY; @@ -973,8 +973,8 @@ static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static u32 hpet_t1_cmp; -static unsigned long hpet_default_delta; -static unsigned long hpet_pie_delta; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; @@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); */ int hpet_rtc_timer_init(void) { - unsigned long cfg, cnt, delta, flags; + unsigned int cfg, cnt, delta; + unsigned long flags; if (!is_hpet_enabled()) return 0; @@ -1035,7 +1036,7 @@ int hpet_rtc_timer_init(void) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; - hpet_default_delta = (unsigned long) clc; + hpet_default_delta = clc; } if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) @@ -1121,7 +1122,7 @@ int hpet_set_periodic_freq(unsigned long freq) clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; do_div(clc, freq); clc >>= hpet_clockevent.shift; - hpet_pie_delta = (unsigned long) clc; + hpet_pie_delta = clc; } return 1; } @@ -1135,7 +1136,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned long cfg, delta; + unsigned int cfg, delta; int lost_ints = -1; if (unlikely(!hpet_rtc_flags)) { -- cgit v1.2.1 From b46b3d70c9c017d7c4ec49f7f3ffd0af5a622277 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:28 -0400 Subject: kprobes: Checks probe address is instruction boudary on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ensure safeness of inserting kprobes by checking whether the specified address is at the first byte of an instruction on x86. This is done by decoding probed function from its head to the probe point. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203428.31965.21939.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..aa15f3e1f64b 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -48,12 +48,14 @@ #include #include #include +#include #include #include #include #include #include +#include void jprobe_return_end(void); @@ -244,6 +246,75 @@ retry: } } +/* Recover the probed instruction at addr for further analysis. */ +static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + kp = get_kprobe((void *)addr); + if (!kp) + return -EINVAL; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, fix_riprel() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return 0; +} + +/* Dummy buffers for kallsyms_lookup */ +static char __dummy_buf[KSYM_NAME_LEN]; + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + int ret; + unsigned long addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + /* + * Another debugging subsystem might insert + * this breakpoint. In that case, we can't + * recover it. + */ + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ @@ -359,6 +430,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) -- cgit v1.2.1 From 89ae465b0ee470f7d3f8a1c61353445c3acbbe2a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:36 -0400 Subject: kprobes: Cleanup fix_riprel() using insn decoder on x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup fix_riprel() in arch/x86/kernel/kprobes.c by using the new x86 instruction decoder instead of using comparisons with raw ad hoc numeric opcodes. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203436.31965.34374.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 128 +++++++++------------------------------------- 1 file changed, 23 insertions(+), 105 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index aa15f3e1f64b..16ae9610f6ff 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -108,50 +108,6 @@ static const u32 twobyte_is_boostable[256 / 32] = { /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -348,68 +304,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) static void __kprobes fix_riprel(struct kprobe *p) { #ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kernel_insn_init(&insn, p->ainsn.insn); - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) p->addr + (s64) insn.displacement.value - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif } -- cgit v1.2.1 From b1cf540f0e5278ecfe8532557e547d833ed269d7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 13 Aug 2009 16:34:44 -0400 Subject: x86: Add pt_regs register and stack access APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add following APIs for accessing registers and stack entries from pt_regs. These APIs are required by kprobes-based event tracer on ftrace. Some other debugging tools might be able to use it too. - regs_query_register_offset(const char *name) Query the offset of "name" register. - regs_query_register_name(unsigned int offset) Query the name of register by its offset. - regs_get_register(struct pt_regs *regs, unsigned int offset) Get the value of a register by its offset. - regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) Check the address is in the kernel stack. - regs_get_kernel_stack_nth(struct pt_regs *reg, unsigned int nth) Get Nth entry of the kernel stack. (N >= 0) - regs_get_argument_nth(struct pt_regs *reg, unsigned int nth) Get Nth argument at function call. (N >= 0) Signed-off-by: Masami Hiramatsu Cc: linux-arch@vger.kernel.org Cc: Ananth N Mavinakayanahalli Cc: Avi Kivity Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: Jim Keniston Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Przemysław Pawełczyk Cc: Roland McGrath Cc: Sam Ravnborg Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vegard Nossum LKML-Reference: <20090813203444.31965.26374.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be3..a33a17d5d5c8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -49,6 +49,118 @@ enum x86_regset { REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. + */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *((unsigned long *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. -- cgit v1.2.1 From 24851d2447830e6cba4c4b641cb73e713f312373 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 26 Aug 2009 23:38:30 +0200 Subject: tracing/kprobes: Dump the culprit kprobe in case of kprobe recursion Kprobes can enter into a probing recursion, ie: a kprobe that does an endless loop because one of its core mechanism function used during probing is also probed itself. This patch helps pinpointing the kprobe that raised such recursion by dumping it and raising a BUG instead of a warning (we also disarm the kprobe to try avoiding recursion in BUG itself). Having a BUG instead of a warning stops the stacktrace in the right place and doesn't pollute the logs with hundreds of traces that eventually end up in a stack overflow. Signed-off-by: Frederic Weisbecker Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli --- arch/x86/kernel/kprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 16ae9610f6ff..ecee3d23fef8 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -490,9 +490,13 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* A probe has been hit in the codepath leading up * to, or just after, single-stepping of a probed * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. + * reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless + * reentering loop and eventually a stack overflow. */ + arch_disarm_kprobe(p); + dump_kprobe(p); + BUG(); } default: /* impossible cases */ -- cgit v1.2.1 From c8bc6f3c806f1fcbfdbf0b1ff6c52dba59192d3b Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 4 Aug 2009 12:07:09 -0700 Subject: x86: arch specific support for remapping HPET MSIs x86 arch support for remapping HPET MSI's by associating the HPET timer block with the interrupt-remapping HW unit and setting up appropriate irq_chip Signed-off-by: Suresh Siddha Cc: Venkatesh Pallipadi Cc: David Woodhouse Cc: Jesse Barnes Cc: Jay Fenlason LKML-Reference: <20090804190729.630510000@intel.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/apic/io_apic.c | 49 +++++++++++++++++++++++++++++++++++------- arch/x86/kernel/hpet.c | 3 ++- 3 files changed, 44 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285d..eae642b0f345 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) } hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; /* * Some broken BIOSes advertise HPET at 0x0. We really do not diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc80..d9c6f14d3b32 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3254,7 +3254,8 @@ void destroy_irq(unsigned int irq) * MSI message composition */ #ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3288,7 +3289,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3453,7 +3457,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3586,7 +3590,7 @@ int arch_setup_dmar_msi(unsigned int irq) int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3626,6 +3630,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif +#endif + .retrigger = ioapic_retrigger_irq, +}; + static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, @@ -3637,20 +3654,36 @@ static struct irq_chip hpet_msi_type = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq) +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { int ret; struct msi_msg msg; struct irq_desc *desc = irq_to_desc(irq); - ret = msi_compose_msg(NULL, irq, &msg); + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq, &msg); desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, - "edge"); + if (irq_remapped(irq)) + set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + handle_edge_irq, "edge"); + else + set_irq_chip_and_handler_name(irq, &hpet_msi_type, + handle_edge_irq, "edge"); return 0; } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba575f0f2e34..7f024ff47d1d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -33,6 +33,7 @@ * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -467,7 +468,7 @@ static int hpet_msi_next_event(unsigned long delta, static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq)) { + if (arch_setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } -- cgit v1.2.1 From e9afe9e1b3fdbd56cca53959a2519e70db9c8095 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:22:58 -0400 Subject: kprobes/x86: Call BUG() when reentering probe into KPROBES_HIT_SS Call BUG() when a probe have been hit on the way of kprobe processing path, because that kind of probes are currently unrecoverable (recovering it will cause an infinite loop and stack overflow). The original code seems to assume that it's caused by an int3 which another subsystem inserted on out-of-line singlestep buffer if the hitting probe is same as current probe. However, in that case, int3-hitting-address is on the out-of-line buffer and should be different from first (current) int3 address. Thus, I decided to remove the code. I also removes arch_disarm_kprobe() because it will involve other stuffs in text_poke(). Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172258.8246.61889.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index ecee3d23fef8..e0fb615ba1e9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -482,22 +482,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, kcb->kprobe_status = KPROBE_REENTER; break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. - * Raise a BUG or we'll continue in an endless - * reentering loop and eventually a stack overflow. - */ - arch_disarm_kprobe(p); - dump_kprobe(p); - BUG(); - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); -- cgit v1.2.1 From f5ad31158d60946b9fd18c8a79c283a6bc432430 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:04 -0400 Subject: kprobes/x86-64: Allow to reenter probe on post_handler Allow to reenter probe on the post_handler of another probe on x86-64, because x86-64 already allows reentering int3. In that case, reentered probe just increases kp.nmissed and returns. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172304.8246.4822.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/kprobes.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e0fb615ba1e9..c5f1f117e0c0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -463,17 +463,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: save_previous_kprobe(kcb); set_current_kprobe(p, regs, kcb); -- cgit v1.2.1 From 8222d718b3ad3ae49c48f69ae4b6a1128c9a92cf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 27 Aug 2009 13:23:25 -0400 Subject: kprobes/x86-64: Fix to move common_interrupt to .kprobes.text Since nmi, debug and int3 returns to irq_return inside common_interrupt, probing this function will cause int3-loop, so it should be marked as __kprobes. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Ingo Molnar LKML-Reference: <20090827172325.8246.40000.stgit@localhost.localdomain> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/entry_64.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c251be745107..36e2ef5cc83f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -809,6 +809,10 @@ END(interrupt) call \func .endm +/* + * Interrupt entry/exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * The interrupt stubs push (~vector+0x80) onto the stack and * then jump to common_interrupt. @@ -947,6 +951,10 @@ ENTRY(retint_kernel) CFI_ENDPROC END(common_interrupt) +/* + * End of kprobes section + */ + .popsection /* * APIC interrupts. -- cgit v1.2.1 From a00e817f42663941ea0aa5f85a9d1c4f8b212839 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Sep 2009 12:47:55 -0400 Subject: kprobes/x86-32: Move irq-exit functions to kprobes section Move irq-exit functions to .kprobes.text section to protect against kprobes recursion. When I ran kprobe stress test on x86-32, I found below symbols cause unrecoverable recursive probing: ret_from_exception ret_from_intr check_userspace restore_all restore_all_notrace restore_nocheck irq_return And also, I found some interrupt/exception entry points that cause similar problems. This patch moves those symbols (including their container functions) to .kprobes.text section to prevent any kprobes probing. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Ingo Molnar LKML-Reference: <20090908164755.24050.81182.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/entry_32.S | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..beb30da203d6 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -333,6 +333,10 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +/* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -383,6 +387,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -513,6 +521,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +717,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. @@ -814,6 +830,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder -- cgit v1.2.1 From ad5cafcdb09c57008c990edd309c0a563b09f238 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 10 Sep 2009 19:53:06 -0400 Subject: x86/ptrace: Fix regs_get_argument_nth() to add correct offset Fix regs_get_argument_nth() to add correct offset bytes. Because offset_of() returns offset in byte, the offset should be added to char * instead of unsigned long *. Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Andi Kleen Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Jason Baron Cc: K.Prasad Cc: Lai Jiangshan Cc: Li Zefan Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <20090910235306.22412.31613.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a33a17d5d5c8..caffb6809452 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -150,7 +150,7 @@ static const int arg_offs_table[] = { unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) { if (n < ARRAY_SIZE(arg_offs_table)) - return *((unsigned long *)regs + arg_offs_table[n]); + return *(unsigned long *)((char *)regs + arg_offs_table[n]); else { /* * The typical case: arg n is on the stack. -- cgit v1.2.1 From 9f0cf4adb6aa0bfccf675c938124e68f7f06349d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 26 Sep 2009 14:33:01 +0200 Subject: x86: Use __builtin_object_size() to validate the buffer size for copy_from_user() gcc (4.x) supports the __builtin_object_size() builtin, which reports the size of an object that a pointer point to, when known at compile time. If the buffer size is not known at compile time, a constant -1 is returned. This patch uses this feature to add a sanity check to copy_from_user(); if the target buffer is known to be smaller than the copy size, the copy is aborted and a WARNing is emitted in memory debug mode. These extra checks compile away when the object size is not known, or if both the buffer size and the copy length are constants. Signed-off-by: Arjan van de Ven LKML-Reference: <20090926143301.2c396b94@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/x8664_ksyms_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 3909e3ba5ce3..a0cdd8cc1d67 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -30,7 +30,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); -EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(copy_to_user); EXPORT_SYMBOL(__copy_from_user_inatomic); -- cgit v1.2.1 From 7c68af6e32c73992bad24107311f3433c89016e2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 19 Sep 2009 09:40:22 +0300 Subject: core, x86: Add user return notifiers Add a general per-cpu notifier that is called whenever the kernel is about to return to userspace. The notifier uses a thread_info flag and existing checks, so there is no impact on user return or context switch fast paths. This will be used initially to speed up KVM task switching by lazily updating MSRs. Signed-off-by: Avi Kivity LKML-Reference: <1253342422-13811-1-git-send-email-avi@redhat.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process.c | 2 ++ arch/x86/kernel/signal.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5284cd2b5776..e51b056fc88f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -224,6 +225,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + propagate_user_return_notify(prev_p, next_p); } int sys_fork(struct pt_regs *regs) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6a44a76055ad..c49f90f7957a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (current->replacement_session_keyring) key_replace_session_keyring(); } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); -- cgit v1.2.1 From d6c304055b3cecd4ca865769ac7cea97a320727b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 7 Oct 2009 21:43:22 +0200 Subject: x86, msr: Remove the bkl from msr_open() Remove the big kernel lock from msr_open() as it doesn't protect anything there. The only racy event that can happen here is a concurrent cpu shutdown. So let's look at what could be racy during/after the above event: - The cpu_online() check is racy, but the bkl doesn't help about that anyway it disables preemption but we may be chcking another cpu than the current one. Also the cpu can still become offlined between open and read calls. - The cpu_data(cpu) returns a safe pointer too. It won't be released on cpu offlining. But some fields can be changed from arch/x86/kernel/smpboot.c:remove_siblinginfo() : - phys_proc_id - cpu_core_id Those are not read from msr_open(). What we are checking is the x86_capability that is left untouched on offlining. So this removal looks safe. Signed-off-by: Frederic Weisbecker Cc: John Kacur Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sven-Thorsten Dietrich LKML-Reference: <1254944602-7382-1-git-send-email-fweisbec@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/msr.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7dd950094178..c00610963238 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -174,21 +174,17 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); - int ret = 0; - lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - ret = -ENXIO; /* No such CPU */ - goto out; - } + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + c = &cpu_data(cpu); if (!cpu_has(c, X86_FEATURE_MSR)) - ret = -EIO; /* MSR not supported */ -out: - unlock_kernel(); - return ret; + return -EIO; /* MSR not supported */ + + return 0; } /* -- cgit v1.2.1 From 170a0bc3808909d8ea0f3f9c725c6565efe7f9c4 Mon Sep 17 00:00:00 2001 From: John Kacur Date: Wed, 7 Oct 2009 20:19:32 +0200 Subject: x86, cpuid: Remove the bkl from cpuid_open() Most of the variables are local to the function. It IS possible that for struct cpuinfo_x86 *c c could point to the same area. However, this is used read only. Signed-off-by: John Kacur LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpuid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index b07af8861244..ef6928418c8f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -118,8 +118,6 @@ static int cpuid_open(struct inode *inode, struct file *file) struct cpuinfo_x86 *c; int ret = 0; - lock_kernel(); - cpu = iminor(file->f_path.dentry->d_inode); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { ret = -ENXIO; /* No such CPU */ @@ -129,7 +127,6 @@ static int cpuid_open(struct inode *inode, struct file *file) if (c->cpuid_level < 0) ret = -EIO; /* CPUID not supported */ out: - unlock_kernel(); return ret; } -- cgit v1.2.1 From d0153ca35d344d9b640dc305031b0703ba3f30f0 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 29 Sep 2009 10:25:24 -0700 Subject: x86, vmi: Mark VMI deprecated and schedule it for removal Add text in feature-removal.txt indicating that VMI will be removed in the 2.6.37 timeframe. Signed-off-by: Alok N Kataria Acked-by: Chris Wright LKML-Reference: <1254193238.13456.48.camel@ank32.eng.vmware.com> [ removed a bogus Kconfig change, marked (DEPRECATED) in Kconfig ] Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 31e6f6cfe53e..d430e4c30193 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -648,7 +648,7 @@ static inline int __init activate_vmi(void) pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi"; + pv_info.name = "vmi [deprecated]"; pv_init_ops.patch = vmi_patch; -- cgit v1.2.1 From 5a943617ef52e9f79cd7cf437aad8870be27aabb Mon Sep 17 00:00:00 2001 From: John Kacur Date: Thu, 8 Oct 2009 17:20:15 +0200 Subject: x86, cpuid: Simplify the code in cpuid_open Peter picked up my patch for tip/x86/cpu that removes the bkl in cpuid_open. Ingo subsequently merged that into tip/master. This patch folds back in tglx's 55968ede164ae523692f00717f50cd926f1382a0 to my patch that removed the bkl. This simplifies the code, and makes it consistent with the changes to kill the bkl in msr.c as well. Originally-by: Thomas Gleixner Signed-off-by: John Kacur Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpuid.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index ef6928418c8f..48e8e6558b26 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -116,18 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file) { unsigned int cpu; struct cpuinfo_x86 *c; - int ret = 0; cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { - ret = -ENXIO; /* No such CPU */ - goto out; - } + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + c = &cpu_data(cpu); if (c->cpuid_level < 0) - ret = -EIO; /* CPUID not supported */ -out: - return ret; + return -EIO; /* CPUID not supported */ + + return 0; } /* -- cgit v1.2.1 From 04a705df47d1ea27ca2b066f24b1951c51792d0d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 6 Oct 2009 16:42:08 +0200 Subject: perf_events: Check for filters on fixed counter events Intel fixed counters do not support all the filters possible with a generic counter. Thus, if a fixed counter event is passed but with certain filters set, then the fixed_mode_idx() function must fail and the event must be measured in a generic counter instead. Reject filters are: inv, edge, cnt-mask. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1254840129-6198-2-git-send-email-eranian@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b5801c311846..1d16bd69551e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1349,6 +1349,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) if (!x86_pmu.num_events_fixed) return -1; + /* + * fixed counters do not take all possible filters + */ + if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) + return -1; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) -- cgit v1.2.1 From b690081d4d3f6a23541493f1682835c3cd5c54a1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 6 Oct 2009 16:42:09 +0200 Subject: perf_events: Add event constraints support for Intel processors On some Intel processors, not all events can be measured in all counters. Some events can only be measured in one particular counter, for instance. Assigning an event to the wrong counter does not crash the machine but this yields bogus counts, i.e., silent error. This patch changes the event to counter assignment logic to take into account event constraints for Intel P6, Core and Nehalem processors. There is no contraints on Intel Atom. There are constraints on Intel Yonah (Core Duo) but they are not provided in this patch given that this processor is not yet supported by perf_events. As a result of the constraints, it is possible for some event groups to never actually be loaded onto the PMU if they contain two events which can only be measured on a single counter. That situation can be detected with the scaling information extracted with read(). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <1254840129-6198-3-git-send-email-eranian@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 109 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1d16bd69551e..9c758548a0e6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -77,6 +77,18 @@ struct cpu_hw_events { struct debug_store *ds; }; +struct event_constraint { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int code; +}; + +#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } +#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->idxmsk[0]; (e)++) + + /* * struct x86_pmu - generic x86 pmu */ @@ -102,6 +114,7 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); + int (*get_event_idx)(struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -110,6 +123,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; +static const struct event_constraint *event_constraint; + /* * Not sure about some of these */ @@ -155,6 +170,16 @@ static u64 p6_pmu_raw_event(u64 hw_event) return hw_event & P6_EVNTSEL_MASK; } +static const struct event_constraint intel_p6_event_constraints[] = +{ + EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; /* * Intel PerfMon v3. Used on Core2 and later. @@ -170,6 +195,35 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; +static const struct event_constraint intel_core_event_constraints[] = +{ + EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static const struct event_constraint intel_nehalem_event_constraints[] = +{ + EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */ + EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */ + EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -932,6 +986,8 @@ static int __hw_perf_event_init(struct perf_event *event) */ hwc->config = ARCH_PERFMON_EVENTSEL_INT; + hwc->idx = -1; + /* * Count user and OS events unless requested not to. */ @@ -1365,6 +1421,45 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) return -1; } +/* + * generic counter allocator: get next free counter + */ +static int gen_get_event_idx(struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); + return idx == x86_pmu.num_events ? -1 : idx; +} + +/* + * intel-specific counter allocator: check event constraints + */ +static int intel_get_event_idx(struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + const struct event_constraint *event_constraint; + int i, code; + + if (!event_constraint) + goto skip; + + code = hwc->config & 0xff; + + for_each_event_constraint(event_constraint, event_constraint) { + if (code == event_constraint->code) { + for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { + if (!test_and_set_bit(i, cpuc->used_mask)) + return i; + } + return -1; + } + } +skip: + return gen_get_event_idx(hwc); +} + /* * Find a PMC slot for the freshly enabled / scheduled in event: */ @@ -1402,11 +1497,10 @@ static int x86_pmu_enable(struct perf_event *event) } else { idx = hwc->idx; /* Try to get the previous generic event again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { + if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_events); - if (idx == x86_pmu.num_events) + idx = x86_pmu.get_event_idx(hwc); + if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); @@ -1883,6 +1977,7 @@ static struct x86_pmu p6_pmu = { */ .event_bits = 32, .event_mask = (1ULL << 32) - 1, + .get_event_idx = intel_get_event_idx, }; static struct x86_pmu intel_pmu = { @@ -1906,6 +2001,7 @@ static struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, + .get_event_idx = intel_get_event_idx, }; static struct x86_pmu amd_pmu = { @@ -1926,6 +2022,7 @@ static struct x86_pmu amd_pmu = { .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, + .get_event_idx = gen_get_event_idx, }; static int p6_pmu_init(void) @@ -1938,10 +2035,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ + event_constraint = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ + event_constraint = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2013,12 +2112,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); + event_constraint = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + event_constraint = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: -- cgit v1.2.1 From fe9081cc9bdabb0be953a39ad977cea14e35bce5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Oct 2009 11:56:07 +0200 Subject: perf, x86: Add simple group validation Refuse to add events when the group wouldn't fit onto the PMU anymore. Naive implementation. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <1254911461.26976.239.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 90 ++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9c758548a0e6..9961d845719d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -114,7 +114,8 @@ struct x86_pmu { u64 intel_ctrl; void (*enable_bts)(u64 config); void (*disable_bts)(void); - int (*get_event_idx)(struct hw_perf_event *hwc); + int (*get_event_idx)(struct cpu_hw_events *cpuc, + struct hw_perf_event *hwc); }; static struct x86_pmu x86_pmu __read_mostly; @@ -523,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL #define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL #define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -1390,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) x86_pmu_enable_event(hwc, idx); } -static int -fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +static int fixed_mode_idx(struct hw_perf_event *hwc) { unsigned int hw_event; @@ -1424,9 +1424,9 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) /* * generic counter allocator: get next free counter */ -static int gen_get_event_idx(struct hw_perf_event *hwc) +static int +gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events); @@ -1436,16 +1436,16 @@ static int gen_get_event_idx(struct hw_perf_event *hwc) /* * intel-specific counter allocator: check event constraints */ -static int intel_get_event_idx(struct hw_perf_event *hwc) +static int +intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); const struct event_constraint *event_constraint; int i, code; if (!event_constraint) goto skip; - code = hwc->config & 0xff; + code = hwc->config & CORE_EVNTSEL_EVENT_MASK; for_each_event_constraint(event_constraint, event_constraint) { if (code == event_constraint->code) { @@ -1457,26 +1457,22 @@ static int intel_get_event_idx(struct hw_perf_event *hwc) } } skip: - return gen_get_event_idx(hwc); + return gen_get_event_idx(cpuc, hwc); } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static int +x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; int idx; - idx = fixed_mode_idx(event, hwc); + idx = fixed_mode_idx(hwc); if (idx == X86_PMC_IDX_FIXED_BTS) { /* BTS is already occupied. */ if (test_and_set_bit(idx, cpuc->used_mask)) return -EAGAIN; hwc->config_base = 0; - hwc->event_base = 0; + hwc->event_base = 0; hwc->idx = idx; } else if (idx >= 0) { /* @@ -1499,17 +1495,33 @@ static int x86_pmu_enable(struct perf_event *event) /* Try to get the previous generic event again */ if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = x86_pmu.get_event_idx(hwc); + idx = x86_pmu.get_event_idx(cpuc, hwc); if (idx == -1) return -EAGAIN; set_bit(idx, cpuc->used_mask); hwc->idx = idx; } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; } + return idx; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = x86_schedule_event(cpuc, hwc); + if (idx < 0) + return idx; + perf_events_lapic_init(); x86_pmu.disable(hwc, idx); @@ -2212,11 +2224,47 @@ static const struct pmu pmu = { .unthrottle = x86_pmu_unthrottle, }; +static int +validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event fake_event = event->hw; + + if (event->pmu != &pmu) + return 0; + + return x86_schedule_event(cpuc, &fake_event); +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct cpu_hw_events fake_pmu; + + memset(&fake_pmu, 0, sizeof(fake_pmu)); + + if (!validate_event(&fake_pmu, leader)) + return -ENOSPC; + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (!validate_event(&fake_pmu, sibling)) + return -ENOSPC; + } + + if (!validate_event(&fake_pmu, event)) + return -ENOSPC; + + return 0; +} + const struct pmu *hw_perf_event_init(struct perf_event *event) { int err; err = __hw_perf_event_init(event); + if (!err) { + if (event->group_leader != event) + err = validate_group(event); + } if (err) { if (event->destroy) event->destroy(event); -- cgit v1.2.1 From e7ab0f7b50bc4688fb5cf65de5d42e3b882fb8d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Oct 2009 15:58:20 +0200 Subject: Revert "x86, timers: Check for pending timers after (device) interrupts" This reverts commit 9bcbdd9c58617f1301dd4f17c738bb9bc73aca70. The real bug producing LatencyTop latencies has been fixed in: f5dc375: sched: Update the clock of runqueue select_task_rq() selected And the commit being reverted here triggers local timer processing from every device IRQ. If device IRQs come in at a high frequency, this could cause a performance regression. The commit being reverted here purely 'fixed' the reported latency as a side effect, because CPUs were being moved out of idle more often. Acked-by: Peter Zijlstra Cc: Arjan van de Ven Cc: Frans Pop Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner LKML-Reference: <20091008064041.67219b13@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 -- arch/x86/kernel/smp.c | 1 - 2 files changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..74656d1d4e30 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -244,7 +244,6 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) __func__, smp_processor_id(), vector, irq); } - run_local_timers(); irq_exit(); set_irq_regs(old_regs); @@ -269,7 +268,6 @@ void smp_generic_interrupt(struct pt_regs *regs) if (generic_interrupt_extension) generic_interrupt_extension(); - run_local_timers(); irq_exit(); set_irq_regs(old_regs); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d915d956e66d..ec1de97600e7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -198,7 +198,6 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); - run_local_timers(); /* * KVM uses this interrupt to force a cpu out of guest mode */ -- cgit v1.2.1 From c5cca146aa03e1f60fb179df65f0dbaf17bc64ed Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 18:31:20 +0200 Subject: x86/amd-iommu: Workaround for erratum 63 There is an erratum for IOMMU hardware which documents undefined behavior when forwarding SMI requests from peripherals and the DTE of that peripheral has a sysmgt value of 01b. This problem caused weird IO_PAGE_FAULTS in my case. This patch implements the suggested workaround for that erratum into the AMD IOMMU driver. The erratum is documented with number 63. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 ++ arch/x86/kernel/amd_iommu_init.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6c99f5037801..f95dfe526444 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1114,6 +1114,8 @@ static void __detach_device(struct protection_domain *domain, u16 devid) amd_iommu_dev_table[devid].data[1] = 0; amd_iommu_dev_table[devid].data[2] = 0; + amd_iommu_apply_erratum_63(devid); + /* decrease reference counter */ domain->dev_cnt -= 1; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c1b17e97252e..498c8c79aaa5 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -509,6 +509,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +static int get_dev_entry_bit(u16 devid, u8 bit) +{ + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; +} + + +void amd_iommu_apply_erratum_63(u16 devid) +{ + int sysmgt; + + sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | + (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); + + if (sysmgt == 0x01) + set_dev_entry_bit(devid, DEV_ENTRY_IW); +} + /* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { @@ -537,6 +557,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); + amd_iommu_apply_erratum_63(devid); + set_iommu_for_device(iommu, devid); } -- cgit v1.2.1 From a6f05a6a0a1713d5b019f096799d49226807d3df Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 8 Oct 2009 18:02:54 -0700 Subject: x86-64: make compat_start_thread() match start_thread() For no real good reason, compat_start_thread() was embedded inline in whereas the native start_thread() lives in process_*.c. Move compat_start_thread() to process_64.c, remove gratuitious differences, and fix a few items which mostly look like bit rot. In particular, compat_start_thread() didn't do free_thread_xstate(), which means it was hanging on to the xstate store area even when it was not needed. It was also not setting old_rsp, but it looks like that generally shouldn't matter for a 32-bit process. Note: compat_start_thread *has* to be a macro, since it is tested with start_thread_ia32() as the out of line function name. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha --- arch/x86/kernel/process_64.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..7cf0a6b6d4bb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -356,7 +356,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) percpu_write(old_rsp, new_sp); regs->cs = __USER_CS; regs->ss = __USER_DS; - regs->flags = 0x200; + regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* * Free the old FP and other extended state @@ -365,6 +365,27 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + percpu_write(old_rsp, new_sp); + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; + regs->flags = X86_EFLAGS_IF; + set_fs(USER_DS); + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +#endif + /* * switch_to(x,y) should switch tasks from x to y. * -- cgit v1.2.1 From e634d8fc792c66c3d4ff45518c04848c1e28f221 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 9 Oct 2009 15:56:53 -0700 Subject: x86-64: merge the standard and compat start_thread() functions The only thing left that differs between the standard and compat start_thread functions is the actual segment numbers and the prototype, so have a single common function which contains the guts and two very small wrappers. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha --- arch/x86/kernel/process_64.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7cf0a6b6d4bb..eb261c582a44 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -344,18 +344,20 @@ out: return err; } -void -start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) { loadsegment(fs, 0); - loadsegment(es, 0); - loadsegment(ds, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; percpu_write(old_rsp, new_sp); - regs->cs = __USER_CS; - regs->ss = __USER_DS; + regs->cs = _cs; + regs->ss = _ss; regs->flags = X86_EFLAGS_IF; set_fs(USER_DS); /* @@ -363,26 +365,19 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) */ free_thread_xstate(current); } -EXPORT_SYMBOL_GPL(start_thread); + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} #ifdef CONFIG_IA32_EMULATION void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) { - loadsegment(fs, 0); - loadsegment(ds, __USER32_DS); - loadsegment(es, __USER32_DS); - load_gs_index(0); - regs->ip = new_ip; - regs->sp = new_sp; - percpu_write(old_rsp, new_sp); - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; - regs->flags = X86_EFLAGS_IF; - set_fs(USER_DS); - /* - * Free the old FP and other extended state - */ - free_thread_xstate(current); + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); } #endif -- cgit v1.2.1 From 3bb258bf430d29a24350fe4f44f8bf07b7b7a8f6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 4 Oct 2009 17:53:29 -0700 Subject: ftrace.c: Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Remove prefixes from pr_, use pr_fmt(fmt). No change in output. Signed-off-by: Joe Perches Acked-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <9b377eefae9e28c599dd4a17bdc81172965e9931.1254701151.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9dbb527e1652..25e6f5fc4b1e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -9,6 +9,8 @@ * the dangers of modifying code on the run. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data) switch (faulted) { case 0: - pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); break; case 1: - pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + pr_info("converting mcount calls to 66 66 66 66 90\n"); memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); break; case 2: - pr_info("ftrace: converting mcount calls to jmp . + 5\n"); + pr_info("converting mcount calls to jmp . + 5\n"); memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); break; } -- cgit v1.2.1 From fb2531953fd8855abdcf458459020fd382c5deca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 7 Oct 2009 13:20:38 +0200 Subject: mce, edac: Use an atomic notifier for MCEs decoding Add an atomic notifier which ensures proper locking when conveying MCE info to EDAC for decoding. The actual notifier call overrides a default, negative priority notifier. Note: make sure we register the default decoder only once since mcheck_init() runs on each CPU. Signed-off-by: Borislav Petkov LKML-Reference: <20091003065752.GA8935@liondog.tnic> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..15ba9c972d7a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -85,18 +85,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -static void default_decode_mce(struct mce *m) +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); + +static int default_decode_mce(struct notifier_block *nb, unsigned long val, + void *data) { pr_emerg("No human readable MCE decoding support on this CPU type.\n"); pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + + return NOTIFY_STOP; } -/* - * CPU/chipset specific EDAC code can register a callback here to print - * MCE errors in a human-readable form: - */ -void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; -EXPORT_SYMBOL(x86_mce_decode_callback); +static struct notifier_block mce_dec_nb = { + .notifier_call = default_decode_mce, + .priority = -1, +}; /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -204,9 +212,9 @@ static void print_mce(struct mce *m) /* * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that): + * (if the CPU has an implementation for that) */ - x86_mce_decode_callback(m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } static void print_mce_head(void) @@ -1420,6 +1428,9 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) mce_cpu_features(c); mce_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + + if (raw_smp_processor_id() == 0) + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); } /* -- cgit v1.2.1 From 494f6a9e12f5137d355d3ce3f5789ef148b642bc Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 7 Oct 2009 19:04:29 -0400 Subject: this_cpu: Use this_cpu_xx in nmi handling this_cpu_inc/dec reduces the number of instructions needed. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- arch/x86/kernel/apic/nmi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 7ff61d6a188a..e631cc4416f7 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -360,7 +360,7 @@ void stop_apic_nmi_watchdog(void *unused) */ static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(local_t, alert_counter); +static DEFINE_PER_CPU(long, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); void touch_nmi_watchdog(void) @@ -437,8 +437,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(per_cpu_var(alert_counter)); + if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -446,7 +446,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - local_set(&__get_cpu_var(alert_counter), 0); + __this_cpu_write(per_cpu_var(alert_counter), 0); } /* see if the nmi watchdog went off */ -- cgit v1.2.1 From 9a821b231644028f8e2a853eb33d1184e925b183 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 12 Oct 2009 12:59:29 +0100 Subject: x86: Move pci_iommu_init to rootfs_initcall() We want this to happen after the PCI quirks, which are now running at the very end of the fs_initcalls. This works around the BIOS problems which were originally addressed by commit db8be50c4307dac2b37305fc59c8dc0f978d09ea ('USB: Work around BIOS bugs by quiescing USB controllers earlier'), which was reverted in commit d93a8f829fe1d2f3002f2c6ddb553d12db420412. Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 64b838eac18c..e0d9199d0eb9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -311,7 +311,7 @@ void pci_iommu_shutdown(void) amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ -- cgit v1.2.1 From 7a4b7e5e741fe0a72a517b0367a2659aa53f7c44 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 6 Oct 2009 16:32:43 +0100 Subject: x86: Fix Suspend to RAM freeze on Acer Aspire 1511Lmi laptop Move the trampoline and accessors back out of .cpuinit.* for the case of 64-bits+ACPI_SLEEP. This solves s2ram hangs reported in: http://bugzilla.kernel.org/show_bug.cgi?id=14279 Reported-and-bisected-by: Christian Casteyde Signed-off-by: Jan Beulich Cc: Cc: "Andrew Morton" Cc: "Rafael J. Wysocki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/trampoline.c | 12 ++++++++++-- arch/x86/kernel/trampoline_64.S | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 699f7eeb896a..cd022121cab6 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -3,8 +3,16 @@ #include #include +#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) +#define __trampinit +#define __trampinitdata +#else +#define __trampinit __cpuinit +#define __trampinitdata __cpuinitdata +#endif + /* ready for x86_64 and x86 */ -unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -unsigned long __cpuinit setup_trampoline(void) +unsigned long __trampinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 596d54c660a5..3af2dff58b21 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,8 +32,12 @@ #include #include +#ifdef CONFIG_ACPI_SLEEP +.section .rodata, "a", @progbits +#else /* We can free up the trampoline after bootup if cpu hotplug is not supported. */ __CPUINITRODATA +#endif .code16 ENTRY(trampoline_data) -- cgit v1.2.1 From ae24ffe5ecec17c956ac25371d7c2e12b4b36e53 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 12 Oct 2009 10:18:23 -0400 Subject: x86, 64-bit: Move K8 B step iret fixup to fault entry asm Move the handling of truncated %rip from an iret fault to the fault entry path. This allows x86-64 to use the standard search_extable() function. Signed-off-by: Brian Gerst Cc: Linus Torvalds Cc: Jan Beulich LKML-Reference: <1255357103-5418-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..af0f4b226dbe 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1491,12 +1491,17 @@ error_kernelspace: leaq irq_return(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs - movl %ecx,%ecx /* zero extend */ - cmpq %rcx,RIP+8(%rsp) - je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret cmpq $gs_change,RIP+8(%rsp) je error_swapgs jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + je error_swapgs END(error_entry) -- cgit v1.2.1 From d1705c558c95418378b11a0be963fe1b3e2fa381 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 11:32:31 -0700 Subject: x86: fix kernel panic on 32 bits when profiling Latest kernel has a kernel panic in booting on i386 machine when profile=2 setting in cmdline. It is due to 'sp' being incorrect in profile_pc(). BUG: unable to handle kernel NULL pointer dereference at 00000246 IP: [] profile_pc+0x2a/0x48 *pde = 00000000 Oops: 0000 [#1] SMP This differs from the original version by Alex Shi in that we use the kernel_stack_pointer() inline already defined in for this purpose, instead of #ifdef. Originally-by: Alex Shi Cc: "Chen, Tim C" Cc: "Rafael J. Wysocki" Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dcb00d278512..be2573448ed9 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = (unsigned long *)regs->sp; + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); /* * Return address is either directly at stack pointer * or above a saved flags. Eflags has bits 22-31 zero, -- cgit v1.2.1 From def3c5d0a34e4b09b3cea4435c17209ad347104d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:09:07 -0700 Subject: x86: use kernel_stack_pointer() in process_32.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..35e6fad73e0d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -134,7 +134,7 @@ void __show_regs(struct pt_regs *regs, int all) ss = regs->ss & 0xffff; gs = get_user_gs(regs); } else { - sp = (unsigned long) (®s->sp); + sp = kernel_stack_pointer(regs); savesegment(ss, ss); savesegment(gs, gs); } -- cgit v1.2.1 From a343c75d338aa2afaea4a2a8e40de9e67b6fb4a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:11:09 -0700 Subject: x86: use kernel_stack_pointer() in dumpstack.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Furthermore, user_mode() is only valid when the process is known to not run in V86 mode. Use the safer user_mode_vm() instead. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/dumpstack.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2d8a371d4339..b8ce165dde5d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) show_registers(regs); #ifdef CONFIG_X86_32 - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); } printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); -- cgit v1.2.1 From 5ca6c0ca5dbf105d7b0ffdae2289519982189730 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:12:18 -0700 Subject: x86: use kernel_stack_pointer() in kgdb.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Signed-off-by: H. Peter Anvin Cc: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8d82a77a3f3b..3310d849abd2 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -88,7 +88,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; - gdb_regs[GDB_SP] = (int)®s->sp; #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +100,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; #endif + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); } /** -- cgit v1.2.1 From 98272ed0d2e6509fe7dc571e77956c99bf653bb6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 14:14:10 -0700 Subject: x86: use kernel_stack_pointer() in kprobes.c The way to obtain a kernel-mode stack pointer from a struct pt_regs in 32-bit mode is "subtle": the stack doesn't actually contain the stack pointer, but rather the location where it would have been marks the actual previous stack frame. For clarity, use kernel_stack_pointer() instead of coding this weirdness explicitly. Signed-off-by: H. Peter Anvin Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu --- arch/x86/kernel/kprobes.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..2ee4fa3a3f01 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -60,19 +60,7 @@ void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef CONFIG_X86_64 -#define stack_addr(regs) ((unsigned long *)regs->sp) -#else -/* - * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs - * don't save the ss and esp registers if the CPU is already in kernel - * mode when it traps. So for kprobes, regs->sp and regs->ss are not - * the [nonexistent] saved stack pointer and ss register, but rather - * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to - * point to the top of the pre-int3 stack. - */ -#define stack_addr(regs) ((unsigned long *)®s->sp) -#endif +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ -- cgit v1.2.1 From 7a693d3f0d10f978ebdf3082c41404ab97106567 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Oct 2009 08:16:30 +0200 Subject: perf_events, x86: Fix event constraints code There was namespace overlap due to a rename i did - this caused the following build warning, reported by Stephen Rothwell against linux-next x86_64 allmodconfig: arch/x86/kernel/cpu/perf_event.c: In function 'intel_get_event_idx': arch/x86/kernel/cpu/perf_event.c:1445: warning: 'event_constraint' is used uninitialized in this function This is a real bug not just a warning: fix it by renaming the global event-constraints table pointer to 'event_constraints'. Reported-by: Stephen Rothwell Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091013144223.369d616d.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9961d845719d..2e20bca3cca1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -124,7 +124,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static const struct event_constraint *event_constraint; +static const struct event_constraint *event_constraints; /* * Not sure about some of these @@ -1442,12 +1442,12 @@ intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc) const struct event_constraint *event_constraint; int i, code; - if (!event_constraint) + if (!event_constraints) goto skip; code = hwc->config & CORE_EVNTSEL_EVENT_MASK; - for_each_event_constraint(event_constraint, event_constraint) { + for_each_event_constraint(event_constraint, event_constraints) { if (code == event_constraint->code) { for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) { if (!test_and_set_bit(i, cpuc->used_mask)) @@ -2047,12 +2047,12 @@ static int p6_pmu_init(void) case 7: case 8: case 11: /* Pentium III */ - event_constraint = intel_p6_event_constraints; + event_constraints = intel_p6_event_constraints; break; case 9: case 13: /* Pentium M */ - event_constraint = intel_p6_event_constraints; + event_constraints = intel_p6_event_constraints; break; default: pr_cont("unsupported p6 CPU model %d ", @@ -2124,14 +2124,14 @@ static int intel_pmu_init(void) sizeof(hw_cache_event_ids)); pr_cont("Core2 events, "); - event_constraint = intel_core_event_constraints; + event_constraints = intel_core_event_constraints; break; default: case 26: memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - event_constraint = intel_nehalem_event_constraints; + event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; case 28: -- cgit v1.2.1 From a2e2725541fad72416326798c2d7fa4dafb7d337 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 Oct 2009 23:40:10 -0700 Subject: net: Introduce recvmmsg socket syscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Meaning receive multiple messages, reducing the number of syscalls and net stack entry/exit operations. Next patches will introduce mechanisms where protocols that want to optimize this operation will provide an unlocked_recvmsg operation. This takes into account comments made by: . Paul Moore: sock_recvmsg is called only for the first datagram, sock_recvmsg_nosec is used for the rest. . Caitlin Bestler: recvmmsg now has a struct timespec timeout, that works in the same fashion as the ppoll one. If the underlying protocol returns a datagram with MSG_OOB set, this will make recvmmsg return right away with as many datagrams (+ the OOB one) it has received so far. . Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen datagrams and then recvmsg returns an error, recvmmsg will return the successfully received datagrams, store the error and return it in the next call. This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg, where we will be able to acquire the lock only at batch start and end, not at every underlying recvmsg call. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 0157cd26d7cc..70c2125d55b9 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -336,3 +336,4 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open + .long sys_recvmmsg -- cgit v1.2.1 From 8968f9d3dc23d9a1821d97c6f11e72a59382e56c Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 13 Oct 2009 16:19:41 +0900 Subject: perf_event, x86, mce: Use TRACE_EVENT() for MCE logging This approach is the first baby step towards solving many of the structural problems the x86 MCE logging code is having today: - It has a private ring-buffer implementation that has a number of limitations and has been historically fragile and buggy. - It is using a quirky /dev/mcelog ioctl driven ABI that is MCE specific. /dev/mcelog is not part of any larger logging framework and hence has remained on the fringes for many years. - The MCE logging code is still very unclean partly due to its ABI limitations. Fields are being reused for multiple purposes, and the whole message structure is limited and x86 specific to begin with. All in one, the x86 tree would like to move away from this private implementation of an event logging facility to a broader framework. By using perf events we gain the following advantages: - Multiple user-space agents can access MCE events. We can have an mcelog daemon running but also a system-wide tracer capturing important events in flight-recorder mode. - Sampling support: the kernel and the user-space call-chain of MCE events can be stored and analyzed as well. This way actual patterns of bad behavior can be matched to precisely what kind of activity happened in the kernel (and/or in the app) around that moment in time. - Coupling with other hardware and software events: the PMU can track a number of other anomalies - monitoring software might chose to monitor those plus the MCE events as well - in one coherent stream of events. - Discovery of MCE sources - tracepoints are enumerated and tools can act upon the existence (or non-existence) of various channels of MCE information. - Filtering support: we just subscribe to and act upon the events we are interested in. Then even on a per event source basis there's in-kernel filter expressions available that can restrict the amount of data that hits the event channel. - Arbitrary deep per cpu buffering of events - we can buffer 32 entries or we can buffer as much as we want, as long as we have the RAM. - An NMI-safe ring-buffer implementation - mappable to user-space. - Built-in support for timestamping of events, PID markers, CPU markers, etc. - A rich ABI accessible over system call interface. Per cpu, per task and per workload monitoring of MCE events can be done this way. The ABI itself has a nice, meaningful structure. - Extensible ABI: new fields can be added without breaking tooling. New tracepoints can be added as the hardware side evolves. There's various parsers that can be used. - Lots of scheduling/buffering/batching modes of operandi for MCE events. poll() support. mmap() support. read() support. You name it. - Rich tooling support: even without any MCE specific extensions added the 'perf' tool today offers various views of MCE data: perf report, perf stat, perf trace can all be used to view logged MCE events and perhaps correlate them to certain user-space usage patterns. But it can be used directly as well, for user-space agents and policy action in mcelog, etc. With this we hope to achieve significant code cleanup and feature improvements in the MCE code, and we hope to be able to drop the /dev/mcelog facility in the end. This patch is just a plain dumb dump of mce_log() records to the tracepoints / perf events framework - a first proof of concept step. Signed-off-by: Hidetoshi Seto Cc: Huang Ying Cc: Andi Kleen LKML-Reference: <4AD42A0D.7050104@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..39caea3d8bc3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -46,6 +46,9 @@ #include "mce-internal.h" +#define CREATE_TRACE_POINTS +#include + int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -141,6 +144,9 @@ void mce_log(struct mce *mce) { unsigned next, entry; + /* Emit the trace record: */ + trace_mce_record(mce); + mce->finished = 0; wmb(); for (;;) { -- cgit v1.2.1 From 194ec34184869f0de1cf255c924fc5299e1b3d27 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 13 Oct 2009 16:33:50 -0400 Subject: function-graph/x86: Replace unbalanced ret with jmp The function graph tracer replaces the return address with a hook to trace the exit of the function call. This hook will finish by returning to the real location the function should return to. But the current implementation uses a ret to jump to the real return location. This causes a imbalance between calls and ret. That is the original function does a call, the ret goes to the handler and then the handler does a ret without a matching call. Although the function graph tracer itself still breaks the branch predictor by replacing the original ret, by using a second ret and causing an imbalance, it breaks the predictor even more. This patch replaces the ret with a jmp to keep the calls and ret balanced. I tested this on one box and it showed a 1.7% increase in performance. Another box only showed a small 0.3% increase. But no box that I tested this on showed a decrease in performance by making this change. Signed-off-by: Steven Rostedt Acked-by: Mathieu Desnoyers Cc: Frederic Weisbecker LKML-Reference: <20091013203425.042034383@goodmis.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 7 ++----- arch/x86/kernel/entry_64.S | 6 +++--- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c097e7d607c6..7d52e9da5e0c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1185,17 +1185,14 @@ END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 pushl %eax - pushl %ecx pushl %edx movl %ebp, %eax call ftrace_return_to_handler - movl %eax, 0xc(%esp) + movl %eax, %ecx popl %edx - popl %ecx popl %eax - ret + jmp *%ecx #endif .section .rodata,"a" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..bd5bbddddf91 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -155,11 +155,11 @@ GLOBAL(return_to_handler) call ftrace_return_to_handler - movq %rax, 16(%rsp) + movq %rax, %rdi movq 8(%rsp), %rdx movq (%rsp), %rax - addq $16, %rsp - retq + addq $24, %rsp + jmp *%rdi #endif -- cgit v1.2.1 From 9844ab11c763bfed9f054c82366b19dcda66aca9 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 00:07:03 +0400 Subject: x86, apic: Introduce the NOOP apic driver Introduce NOOP APIC driver. We should use it in case if apic was disabled due to hardware of software/firmware problems (including user requested to disable it case). The driver is attempting to catch any inappropriate apic operation call with warning issue. Also it is possible to use some apic operation like IPI calls, read/write without checking for apic presence which should make callers code easier. Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091013201022.534682104@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 2 +- arch/x86/kernel/apic/apic_noop.c | 194 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/apic/apic_noop.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index da7b7b9f8bd8..565c1bfc507d 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,7 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 000000000000..0b93ec2fde0a --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,194 @@ +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... and issue a warning if an operation is not allowed + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +/* + * some operations should never be reached with + * noop apic if it's not turned off, this mostly + * means the caller forgot to disable apic (or + * check the apic presence) before doing a call + */ +static void warn_apic_enabled(void) +{ + WARN_ONCE((cpu_has_apic || !disable_apic), + "APIC: Called for NOOP operation with apic enabled\n"); +} + +/* + * To check operations but do not bloat source code + */ +#define NOOP_FUNC(func) func { warn_apic_enabled(); } +#define NOOP_FUNC_RET(func, ret) func { warn_apic_enabled(); return ret; } + +NOOP_FUNC(static void noop_init_apic_ldr(void)) +NOOP_FUNC(static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector)) +NOOP_FUNC(static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)) +NOOP_FUNC(static void noop_send_IPI_allbutself(int vector)) +NOOP_FUNC(static void noop_send_IPI_all(int vector)) +NOOP_FUNC(static void noop_send_IPI_self(int vector)) +NOOP_FUNC_RET(static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip), -1) +NOOP_FUNC(static void noop_apic_write(u32 reg, u32 v)) +NOOP_FUNC(void noop_apic_wait_icr_idle(void)) +NOOP_FUNC_RET(static u32 noop_safe_apic_wait_icr_idle(void), 0) +NOOP_FUNC_RET(static u64 noop_apic_icr_read(void), 0) +NOOP_FUNC(static void noop_apic_icr_write(u32 low, u32 id)) +NOOP_FUNC_RET(static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map), phys_map) +NOOP_FUNC_RET(static int noop_cpu_to_logical_apicid(int cpu), 1) +NOOP_FUNC_RET(static int noop_default_phys_pkg_id(int cpuid_apic, int index_msb), 0) +NOOP_FUNC_RET(static unsigned int noop_get_apic_id(unsigned long x), 0) + +static int noop_probe(void) +{ + /* should not ever be enabled this way */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + warn_apic_enabled(); + return physid_isset(read_apic_id(), phys_cpu_present_map); +} + +static const struct cpumask *noop_target_cpus(void) +{ + warn_apic_enabled(); + + /* only BSP here */ + return cpumask_of(0); +} + +static unsigned long noop_check_apicid_used(physid_mask_t bitmap, int apicid) +{ + warn_apic_enabled(); + return physid_isset(apicid, bitmap); +} + +static unsigned long noop_check_apicid_present(int bit) +{ + warn_apic_enabled(); + return physid_isset(bit, phys_cpu_present_map); +} + +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + warn_apic_enabled(); + if (cpu != 0) + pr_warning("APIC: Vector allocated for non-BSP cpu\n"); + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +int noop_apicid_to_node(int logical_apicid) +{ + warn_apic_enabled(); + + /* we're always on node 0 */ + return 0; +} + +static u32 noop_apic_read(u32 reg) +{ + /* + * noop-read is always safe until we have + * non-disabled unit + */ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +struct apic apic_noop = { + .name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = noop_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = noop_check_apicid_used, + .check_apicid_present = noop_check_apicid_present, + + .vector_allocation_domain = noop_vector_allocation_domain, + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = noop_ioapic_phys_id_map, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .apicid_to_node = noop_apicid_to_node, + + .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = default_apicid_to_cpu_present, + + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + + .phys_pkg_id = noop_default_phys_pkg_id, + + .mps_oem_check = NULL, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid = default_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + /* should be safe */ + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = NULL, + + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, +}; -- cgit v1.2.1 From a933c61829509eb27083146dda392132baa0969a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 00:07:04 +0400 Subject: x86, apic: Use apic noop driver In case if apic were disabled we may use the whole apic NOOP driver instead of sparse poking the some functions in apic driver. Also NOOP would catch any inappropriate apic operation calls (not just read/write). Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091013201022.747817361@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 894aa97f0717..61a5628810da 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -241,28 +241,12 @@ static int modern_apic(void) } /* - * bare function to substitute write operation - * and it's _that_ fast :) - */ -static void native_apic_write_dummy(u32 reg, u32 v) -{ - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); -} - -static u32 native_apic_read_dummy(u32 reg) -{ - WARN_ON_ONCE((cpu_has_apic && !disable_apic)); - return 0; -} - -/* - * right after this call apic->write/read doesn't do anything - * note that there is no restore operation it works one way + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything */ void apic_disable(void) { - apic->read = native_apic_read_dummy; - apic->write = native_apic_write_dummy; + apic = &apic_noop; } void native_apic_wait_icr_idle(void) -- cgit v1.2.1 From 2626eb2b2fd958dc0f683126aa84e93b939699a1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 14 Oct 2009 00:07:05 +0400 Subject: x86, apic: Limit apic dumping, introduce new show_lapic= setup option In case if a system has a large number of cpus printing apics contents may consume a long time period. We limit such an output by 1 apic by default. But to have an ability to see all apics or some part of them we introduce "show_lapic" setup option which allow us to limit/unlimit the number of APICs being dumped. Example: apic=debug show_lapic=5, or apic=debug show_lapic=all Also move apic_verbosity checking upper that way so helper routines do not need to inspect it at all. Suggested-by: Yinghai Lu Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org Cc: macro@linux-mips.org LKML-Reference: <20091013201022.926793122@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 47 ++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index dc69f28489f5..8c718c93d079 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1599,9 +1599,6 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_desc *desc; unsigned int irq; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", @@ -1708,9 +1705,6 @@ __apicdebuginit(void) print_APIC_field(int base) { int i; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG); for (i = 0; i < 8; i++) @@ -1724,9 +1718,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy) unsigned int i, v, ver, maxlvt; u64 icr; - if (apic_verbosity == APIC_QUIET) - return; - printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); @@ -1824,13 +1815,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n"); } -__apicdebuginit(void) print_all_local_APICs(void) +__apicdebuginit(void) print_local_APICs(int maxcpu) { int cpu; + if (!maxcpu) + return; + preempt_disable(); - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } preempt_enable(); } @@ -1839,7 +1836,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) + if (!nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1866,21 +1863,41 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -__apicdebuginit(int) print_all_ICs(void) +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) { + if (apic_verbosity == APIC_QUIET) + return 0; + print_PIC(); /* don't print out if apic is not there */ if (!cpu_has_apic && !apic_from_smp_config()) return 0; - print_all_local_APICs(); + print_local_APICs(show_lapic); print_IO_APIC(); return 0; } -fs_initcall(print_all_ICs); +fs_initcall(print_ICs); /* Where if anywhere is the i8259 connect in external int mode */ -- cgit v1.2.1 From 6c2c502910247d2820cb630e7b28fb6bdecdbf45 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 30 Sep 2009 11:02:59 -0500 Subject: x86: SGI UV: Fix irq affinity for hub based interrupts This patch fixes handling of uv hub irq affinity. IRQs with ALL or NODE affinity can be routed to cpus other than their originally assigned cpu. Those with CPU affinity cannot be rerouted. Signed-off-by: Dimitri Sivanich LKML-Reference: <20090930160259.GA7822@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 49 ++++++++++++++-- arch/x86/kernel/uv_irq.c | 128 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 165 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8c718c93d079..bb52e7f6e953 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3731,9 +3731,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) * on the specified blade to allow the sending of MSIs to the specified CPU. */ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); struct irq_cfg *cfg; int mmr_pnode; unsigned long mmr_value; @@ -3749,6 +3750,11 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + spin_lock_irqsave(&vector_lock, flags); set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, irq_name); @@ -3777,11 +3783,10 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, * Disable the specified MMR located on the specified blade so that MSIs are * longer allowed to be sent. */ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) { unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3789,9 +3794,45 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->mask = 1; - mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); } + +int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} #endif /* CONFIG_X86_64 */ int __init io_apic_get_redir_entries (int ioapic) diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index aeef529917e4..9a83775ab0f3 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -9,10 +9,22 @@ */ #include +#include #include #include #include +#include + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; static void uv_noop(unsigned int irq) { @@ -39,25 +51,106 @@ struct irq_chip uv_irq_chip = { .unmask = uv_noop, .eoi = uv_ack_apic, .end = uv_noop, + .set_affinity = uv_set_irq_affinity, }; +/* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. + */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. */ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + /* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int restrict) { - int irq; - int ret; + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq(); if (irq <= 0) return -EBUSY; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); - if (ret != irq) + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + restrict); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else destroy_irq(irq); return ret; @@ -71,9 +164,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); * * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). */ -void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +void uv_teardown_irq(unsigned int irq) { - arch_disable_uv_irq(mmr_blade, mmr_offset); + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); destroy_irq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); -- cgit v1.2.1 From 9338ad6ffb70eca97f335d93c54943828c8b209e Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 13 Oct 2009 15:32:36 -0500 Subject: x86, apic: Move SGI UV functionality out of generic IO-APIC code Move UV specific functionality out of the generic IO-APIC code. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091013203236.GD20543@sgi.com> [ Cleaned up the code some more in their new places. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 140 ++--------------------------------------- arch/x86/kernel/uv_irq.c | 123 ++++++++++++++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 141 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index bb52e7f6e953..ce16b65cfdcc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -60,8 +60,6 @@ #include #include #include -#include -#include #include @@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -209,7 +193,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? irq_cfgx + irq : NULL; } @@ -1237,8 +1221,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -2245,7 +2228,7 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; @@ -2289,15 +2272,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns * ->cpu_mask_to_apicid of that, or returns BAD_APICID and * leaves desc->affinity untouched. */ -static unsigned int +unsigned int set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) { struct irq_cfg *cfg; @@ -3725,116 +3705,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_UV -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int restrict) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg; - int mmr_pnode; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long flags; - int err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - cfg = irq_cfg(irq); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - if (restrict == UV_AFFINITY_CPU) - desc->status |= IRQ_NO_BALANCING; - else - desc->status |= IRQ_MOVE_PCNTXT; - - spin_lock_irqsave(&vector_lock, flags); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - spin_unlock_irqrestore(&vector_lock, flags); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. - */ -void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} - -int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irq_cfg *cfg = desc->chip_data; - unsigned int dest; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long mmr_offset; - unsigned mmr_pnode; - - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return 0; -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 9a83775ab0f3..61d805df4c91 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -18,13 +18,16 @@ /* MMR offset and pnode of hub sourcing interrupts for a given irq */ struct uv_irq_2_mmr_pnode{ - struct rb_node list; - unsigned long offset; - int pnode; - int irq; + struct rb_node list; + unsigned long offset; + int pnode; + int irq; }; -static spinlock_t uv_irq_lock; -static struct rb_root uv_irq_root; + +static spinlock_t uv_irq_lock; +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(unsigned int, const struct cpumask *); static void uv_noop(unsigned int irq) { @@ -131,6 +134,114 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) return -1; } +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int restrict) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (restrict == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + unsigned mmr_pnode; + + dest = set_desc_affinity(desc, mask); + if (dest == BAD_APICID) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + /* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an -- cgit v1.2.1 From c44fc770845163f8d9e573f37f92a7b7a7ade14e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 19 Sep 2009 06:50:42 +0200 Subject: tracing: Move syscalls metadata handling from arch to core Most of the syscalls metadata processing is done from arch. But these operations are mostly generic accross archs. Especially now that we have a common variable name that expresses the number of syscalls supported by an arch: NR_syscalls, the only remaining bits that need to reside in arch is the syscall nr to addr translation. v2: Compare syscalls symbols only after the "sys" prefix so that we avoid spurious mismatches with archs that have syscalls wrappers, in which case syscalls symbols have "SyS" prefixed aliases. (Reported by: Heiko Carstens) Signed-off-by: Frederic Weisbecker Acked-by: Heiko Carstens Cc: Ingo Molnar Cc: Steven Rostedt Cc: Li Zefan Cc: Masami Hiramatsu Cc: Jason Baron Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Paul Mundt --- arch/x86/kernel/ftrace.c | 76 ++---------------------------------------------- 1 file changed, 2 insertions(+), 74 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 25e6f5fc4b1e..5a1b9758fd62 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -470,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; extern unsigned long *sys_call_table; -static struct syscall_metadata **syscalls_metadata; - -static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) -{ - struct syscall_metadata *start; - struct syscall_metadata *stop; - char str[KSYM_SYMBOL_LEN]; - - - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; - kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); - - for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; - } - return NULL; -} - -struct syscall_metadata *syscall_nr_to_meta(int nr) -{ - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) - return NULL; - - return syscalls_metadata[nr]; -} - -int syscall_name_to_nr(char *name) -{ - int i; - - if (!syscalls_metadata) - return -1; - - for (i = 0; i < NR_syscalls; i++) { - if (syscalls_metadata[i]) { - if (!strcmp(syscalls_metadata[i]->name, name)) - return i; - } - } - return -1; -} - -void set_syscall_enter_id(int num, int id) -{ - syscalls_metadata[num]->enter_id = id; -} - -void set_syscall_exit_id(int num, int id) +unsigned long __init arch_syscall_addr(int nr) { - syscalls_metadata[num]->exit_id = id; -} - -static int __init arch_init_ftrace_syscalls(void) -{ - int i; - struct syscall_metadata *meta; - unsigned long **psys_syscall_table = &sys_call_table; - - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return -ENOMEM; - } - - for (i = 0; i < NR_syscalls; i++) { - meta = find_syscall_meta(psys_syscall_table[i]); - syscalls_metadata[i] = meta; - } - return 0; + return (unsigned long)(&sys_call_table)[nr]; } -arch_initcall(arch_init_ftrace_syscalls); #endif -- cgit v1.2.1 From 89ccf465abe6b20d804a63ae20307970c441369d Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 14 Oct 2009 18:50:39 +0800 Subject: x86, perf_event: Rename 'performance counter interrupt' In 'cdd6c482c9ff9c55475ee7392ec8f672eddb7be6', we renamed Performance Counters -> Performance Events. The name showed up in /proc/interrupts also needs a change. I use PMI (Performance monitoring interrupt) here, since it is the official name used in Intel's documents. Signed-off-by: Li Hong Cc: Andrew Morton Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20091014105039.GA22670@uhli> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..86fb2c8e065a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); -- cgit v1.2.1 From ac06ea2cd06291e63951b51dd7c9a23e6a1f2683 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 10 Oct 2009 09:35:48 +0200 Subject: x86: Remove BKL from microcode cycle_lock_kernel() in microcode_open() is a worthless exercise as there is nothing to wait for. Remove it. Signed-off-by: Thomas Gleixner LKML-Reference: <20091010153349.196074920@linutronix.de> --- arch/x86/kernel/microcode_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 378e9a8f1bf8..2bcad3926edb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -73,7 +73,6 @@ #include #include #include -#include #include #include #include @@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *unused1, struct file *unused2) { - cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } -- cgit v1.2.1 From 05d86412eab6a18cf57697474cc4f8fbfcd6936f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 9 Oct 2009 19:02:20 +0200 Subject: x86: Remove BKL from apm_32 The lock/unlock kernel pair in do_open() got there with the BKL push down and protects nothing. Remove it. Replace the lock/unlock kernel in the ioctl code with a mutex to protect standbys_pending and suspends_pending. Signed-off-by: Thomas Gleixner LKML-Reference: <20091010153349.365236337@linutronix.de> --- arch/x86/kernel/apm_32.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 151ace69a5aa..b5b6b23bce53 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,7 +204,6 @@ #include #include -#include #include #include #include @@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); /* * Set up a segment that references the real mode segment 0x40 @@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) return -EPERM; switch (cmd) { case APM_IOC_STANDBY: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->standbys_read > 0) { as->standbys_read--; as->standbys_pending--; @@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_STANDBY, as); if (standbys_pending <= 0) standby(); - unlock_kernel(); + mutex_unlock(&apm_mutex); break; case APM_IOC_SUSPEND: - lock_kernel(); + mutex_lock(&apm_mutex); if (as->suspends_read > 0) { as->suspends_read--; as->suspends_pending--; @@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg) queue_event(APM_USER_SUSPEND, as); if (suspends_pending <= 0) { ret = suspend(1); + mutex_unlock(&apm_mutex); } else { as->suspend_wait = 1; + mutex_unlock(&apm_mutex); wait_event_interruptible(apm_suspend_waitqueue, as->suspend_wait == 0); ret = as->suspend_result; } - unlock_kernel(); return ret; default: return -ENOTTY; @@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; - lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); - unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; - unlock_kernel(); return 0; } -- cgit v1.2.1 From e47938b1faaf9e9041ae842a878901001ce20ea1 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:16:30 -0500 Subject: x86: UV RTC: Fix early expiry handling Tune/fix early timer expiry handling and return correct early timeout value for set_next_event. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014141630.GB11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 583f11d5c480..ec14889628e0 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires) /* Initialize comparator value */ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); + if (uv_read_rtc(NULL) <= expires) + return 0; + + return !uv_intr_pending(pnode); } /* @@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) next_cpu = head->next_cpu; *t = expires; + /* Will this one be next to go off? */ if (next_cpu < 0 || bcpu == next_cpu || expires < head->cpu[next_cpu].expires) { @@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) *t = ULLONG_MAX; uv_rtc_find_next_timer(head, pnode); spin_unlock_irqrestore(&head->lock, flags); - return 1; + return -ETIME; } } @@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires) * * Returns 1 if this timer was pending. */ -static int uv_rtc_unset_timer(int cpu) +static int uv_rtc_unset_timer(int cpu, int force) { int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); @@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu) spin_lock_irqsave(&head->lock, flags); - if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) + if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) rc = 1; - *t = ULLONG_MAX; - - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); + if (rc) { + *t = ULLONG_MAX; + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + } spin_unlock_irqrestore(&head->lock, flags); @@ -310,20 +315,20 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu); + uv_rtc_unset_timer(ced_cpu, 1); break; } } static void uv_rtc_interrupt(void) { - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); int cpu = smp_processor_id(); + struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); if (!ced || !ced->event_handler) return; - if (uv_rtc_unset_timer(cpu) != 1) + if (uv_rtc_unset_timer(cpu, 0) != 1) return; ced->event_handler(ced); -- cgit v1.2.1 From 8c28de4d011f37b2893ecfcec9a985c0e9bd786f Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:18:48 -0500 Subject: x86: UV RTC: Add clocksource only boot option Add clocksource only boot option for UV RTC. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014141848.GC11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index ec14889628e0..c6324ad7c0d9 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -75,6 +75,7 @@ struct uv_rtc_timer_head { static struct uv_rtc_timer_head **blade_info __read_mostly; static int uv_rtc_enable; +static int uv_rtc_evt_enable; /* * Hardware interface routines @@ -342,6 +343,14 @@ static int __init uv_enable_rtc(char *str) } __setup("uvrtc", uv_enable_rtc); +static int __init uv_enable_evt_rtc(char *str) +{ + uv_rtc_evt_enable = 1; + + return 1; +} +__setup("uvrtcevt", uv_enable_evt_rtc); + static __init void uv_rtc_register_clockevents(struct work_struct *dummy) { struct clock_event_device *ced = &__get_cpu_var(cpu_ced); @@ -358,16 +367,20 @@ static __init int uv_rtc_setup_clock(void) if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) return -ENODEV; - generic_interrupt_extension = uv_rtc_interrupt; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); rc = clocksource_register(&clocksource_uv); - if (rc) { - generic_interrupt_extension = NULL; + if (rc) + printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); + else + printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", + sn_rtc_cycles_per_second/(unsigned long)1E6); + + if (rc || !uv_rtc_evt_enable) return rc; - } + + generic_interrupt_extension = uv_rtc_interrupt; /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); -- cgit v1.2.1 From d5991ff297ad2f7e2698eefcd8269df5ecec150f Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:21:03 -0500 Subject: x86: UV RTC: Clean up error handling Cleanup error handling in uv_rtc_setup_clock. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014142103.GD11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index c6324ad7c0d9..255645084534 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -380,15 +380,12 @@ static __init int uv_rtc_setup_clock(void) if (rc || !uv_rtc_evt_enable) return rc; - generic_interrupt_extension = uv_rtc_interrupt; - /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); - if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; - return rc; - } + if (rc) + goto error; + + generic_interrupt_extension = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -401,11 +398,19 @@ static __init int uv_rtc_setup_clock(void) rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - clocksource_unregister(&clocksource_uv); generic_interrupt_extension = NULL; uv_rtc_deallocate_timers(); + goto error; } + printk(KERN_INFO "UV RTC clockevents registered\n"); + + return 0; + +error: + clocksource_unregister(&clocksource_uv); + printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); + return rc; } arch_initcall(uv_rtc_setup_clock); -- cgit v1.2.1 From 4a4de9c7d7111ce4caf422b856756125d8304f9d Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 14 Oct 2009 09:22:57 -0500 Subject: x86: UV RTC: Rename generic_interrupt to x86_platform_ipi Signed-off-by: Dimitri Sivanich LKML-Reference: <20091014142257.GE11048@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/irq.c | 20 ++++++++++---------- arch/x86/kernel/irqinit.c | 4 ++-- arch/x86/kernel/uv_time.c | 10 +++++----- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b5c061f8f358..6714432ef381 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -969,8 +969,8 @@ apicinterrupt UV_BAU_MESSAGE \ #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt GENERIC_INTERRUPT_VECTOR \ - generic_interrupt smp_generic_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..9375dce39f5f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -18,7 +18,7 @@ atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ -void (*generic_interrupt_extension)(void) = NULL; +void (*x86_platform_ipi_callback)(void) = NULL; /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); #endif - if (generic_interrupt_extension) { + if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP @@ -187,8 +187,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_pending_irqs; #endif - if (generic_interrupt_extension) - sum += irq_stats(cpu)->generic_irqs; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -252,9 +252,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) } /* - * Handler for GENERIC_INTERRUPT_VECTOR. + * Handler for X86_PLATFORM_IPI_VECTOR. */ -void smp_generic_interrupt(struct pt_regs *regs) +void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -264,10 +264,10 @@ void smp_generic_interrupt(struct pt_regs *regs) irq_enter(); - inc_irq_stat(generic_irqs); + inc_irq_stat(x86_platform_ipis); - if (generic_interrupt_extension) - generic_interrupt_extension(); + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); run_local_timers(); irq_exit(); diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 40f30773fb29..d5932226614f 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -200,8 +200,8 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI for X86 platform specific use */ + alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 255645084534..3da7b1d8bfd3 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -91,7 +91,7 @@ static void uv_rtc_send_IPI(int cpu) pnode = uv_apicid_to_pnode(apicid); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } @@ -116,7 +116,7 @@ static int uv_setup_intr(int cpu, u64 expires) uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, UVH_EVENT_OCCURRED0_RTC1_MASK); - val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); /* Set configuration */ @@ -364,7 +364,7 @@ static __init int uv_rtc_setup_clock(void) { int rc; - if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + if (!uv_rtc_enable || !is_uv_system() || x86_platform_ipi_callback) return -ENODEV; clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, @@ -385,7 +385,7 @@ static __init int uv_rtc_setup_clock(void) if (rc) goto error; - generic_interrupt_extension = uv_rtc_interrupt; + x86_platform_ipi_callback = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -398,7 +398,7 @@ static __init int uv_rtc_setup_clock(void) rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - generic_interrupt_extension = NULL; + x86_platform_ipi_callback = NULL; uv_rtc_deallocate_timers(); goto error; } -- cgit v1.2.1 From e9a63a4e559fbdc522072281d05e6b13c1022f4b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 14 Oct 2009 14:16:38 -0700 Subject: x86: linker script syntax nits The linker scripts grew some use of weirdly wrong linker script syntax. It happens to work, but it's not what the syntax is documented to be. Clean it up to use the official syntax. Signed-off-by: Roland McGrath CC: Ian Lance Taylor --- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 17 ++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cda..0e50e1e5c573 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -56,6 +56,6 @@ SECTIONS /DISCARD/ : { *(.note*) } - - . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } + +ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..8d6001ad8d8d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,8 +305,8 @@ SECTIONS #ifdef CONFIG_X86_32 -. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -319,12 +319,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -332,7 +332,6 @@ INIT_PER_CPU(irq_stack_union); #ifdef CONFIG_KEXEC #include -. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif - -- cgit v1.2.1 From db8590f5043f3436a65b24155a3a7af2604df876 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Oct 2009 08:08:12 +0200 Subject: Revert "x86: linker script syntax nits" This reverts commit e9a63a4e559fbdc522072281d05e6b13c1022f4b. This breaks older binutils, where sink-less asserts are broken. See this commit for further details: d2ba8b2: x86: Fix assert syntax in vmlinux.lds.S Acked-by: "H. Peter Anvin" Acked-by: Sam Ravnborg Cc: Linus Torvalds LKML-Reference: <4AD6523D.5030909@zytor.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 0e50e1e5c573..7da00b799cda 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -56,6 +56,6 @@ SECTIONS /DISCARD/ : { *(.note*) } -} -ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); + . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); +} diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8d6001ad8d8d..92929fb3f9fa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,8 +305,8 @@ SECTIONS #ifdef CONFIG_X86_32 -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -319,12 +319,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -332,6 +332,7 @@ ASSERT((per_cpu__irq_stack_union == 0), #ifdef CONFIG_KEXEC #include -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif + -- cgit v1.2.1 From f88f2b4fdb1e098433ad2b005b6f7353f7268ce1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 15 Oct 2009 19:04:16 +0400 Subject: x86: apic: Allow noop operations to be called almost at any time As only apic noop is used we allow to use almost any operation caller wants (and which of them noop driver supports of course). Initially it was reported by Ingo Molnar that apic noop issue a warning for pkg id (which is actually false positive and should be eliminated). So we save checking (and warning issue) for read/write operations while allow any other ops to be freely used. Also: - fix noop_cpu_to_logical_apicid, it should be 0. - rename noop_default_phys_pkg_id to noop_phys_pkg_id (we use default_ prefix for more general routines in apic subsystem). Reported-by: Ingo Molnar Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Maciej W. Rozycki LKML-Reference: <20091015150416.GC5331@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/apic/apic_noop.c | 105 +++++++++++++++++++++------------------ 2 files changed, 59 insertions(+), 47 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 61a5628810da..dce93d4b0eaf 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -246,6 +246,7 @@ static int modern_apic(void) */ void apic_disable(void) { + pr_info("APIC: switched to apic NOOP\n"); apic = &apic_noop; } diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 0b93ec2fde0a..9ab6ffb313ac 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -6,7 +6,7 @@ * * Though in case if apic is disabled (for some reason) we try * to not uglify the caller's code and allow to call (some) apic routines - * like self-ipi, etc... and issue a warning if an operation is not allowed + * like self-ipi, etc... */ #include @@ -30,76 +30,88 @@ #include #include -/* - * some operations should never be reached with - * noop apic if it's not turned off, this mostly - * means the caller forgot to disable apic (or - * check the apic presence) before doing a call - */ -static void warn_apic_enabled(void) +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) { - WARN_ONCE((cpu_has_apic || !disable_apic), - "APIC: Called for NOOP operation with apic enabled\n"); + return -1; } -/* - * To check operations but do not bloat source code - */ -#define NOOP_FUNC(func) func { warn_apic_enabled(); } -#define NOOP_FUNC_RET(func, ret) func { warn_apic_enabled(); return ret; } - -NOOP_FUNC(static void noop_init_apic_ldr(void)) -NOOP_FUNC(static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector)) -NOOP_FUNC(static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)) -NOOP_FUNC(static void noop_send_IPI_allbutself(int vector)) -NOOP_FUNC(static void noop_send_IPI_all(int vector)) -NOOP_FUNC(static void noop_send_IPI_self(int vector)) -NOOP_FUNC_RET(static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip), -1) -NOOP_FUNC(static void noop_apic_write(u32 reg, u32 v)) -NOOP_FUNC(void noop_apic_wait_icr_idle(void)) -NOOP_FUNC_RET(static u32 noop_safe_apic_wait_icr_idle(void), 0) -NOOP_FUNC_RET(static u64 noop_apic_icr_read(void), 0) -NOOP_FUNC(static void noop_apic_icr_write(u32 low, u32 id)) -NOOP_FUNC_RET(static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map), phys_map) -NOOP_FUNC_RET(static int noop_cpu_to_logical_apicid(int cpu), 1) -NOOP_FUNC_RET(static int noop_default_phys_pkg_id(int cpuid_apic, int index_msb), 0) -NOOP_FUNC_RET(static unsigned int noop_get_apic_id(unsigned long x), 0) +static u32 noop_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map) +{ + return phys_map; +} + +static int noop_cpu_to_logical_apicid(int cpu) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} static int noop_probe(void) { - /* should not ever be enabled this way */ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ return 0; } static int noop_apic_id_registered(void) { - warn_apic_enabled(); - return physid_isset(read_apic_id(), phys_cpu_present_map); + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); } static const struct cpumask *noop_target_cpus(void) { - warn_apic_enabled(); - /* only BSP here */ return cpumask_of(0); } static unsigned long noop_check_apicid_used(physid_mask_t bitmap, int apicid) { - warn_apic_enabled(); return physid_isset(apicid, bitmap); } static unsigned long noop_check_apicid_present(int bit) { - warn_apic_enabled(); return physid_isset(bit, phys_cpu_present_map); } static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) { - warn_apic_enabled(); if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); cpumask_clear(retmask); @@ -108,22 +120,21 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) int noop_apicid_to_node(int logical_apicid) { - warn_apic_enabled(); - /* we're always on node 0 */ return 0; } static u32 noop_apic_read(u32 reg) { - /* - * noop-read is always safe until we have - * non-disabled unit - */ WARN_ON_ONCE((cpu_has_apic && !disable_apic)); return 0; } +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE((cpu_has_apic || !disable_apic)); +} + struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -157,7 +168,7 @@ struct apic apic_noop = { .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, - .phys_pkg_id = noop_default_phys_pkg_id, + .phys_pkg_id = noop_phys_pkg_id, .mps_oem_check = NULL, -- cgit v1.2.1 From a5912f6b3e20c137172460e6d4dd180866c00963 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Oct 2009 07:18:46 +0200 Subject: x86: Document linker script ASSERT() quirk Older binutils breaks if ASSERT() is used without a sink for the output. For example 2.14.90.0.6 is known to be broken, the link fails with: LD .tmp_vmlinux1 ld:arch/x86/kernel/vmlinux.lds:678: parse error Document this quirk in all three files that use it. See: http://marc.info/?l=linux-kbuild&m=124930110427870&w=2 See[2]: d2ba8b2 ("x86: Fix assert syntax in vmlinux.lds.S") Cc: Linus Torvalds Cc: Roland McGrath Cc: "H. Peter Anvin" Cc: Sam Ravnborg LKML-Reference: <4AD6523D.5030909@zytor.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 3 +++ arch/x86/kernel/vmlinux.lds.S | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cda..060fff8f5c5b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -57,5 +57,8 @@ SECTIONS *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..3c68fe2d46cf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,6 +305,9 @@ SECTIONS #ifdef CONFIG_X86_32 +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); #else -- cgit v1.2.1 From 036ed8ba61b72c19dc5759446d4fe0844aa88255 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 15 Oct 2009 17:40:00 -0500 Subject: x86, UV: Fix information in __uv_hub_info structure A few parts of the uv_hub_info structure are initialized incorrectly. - n_val is being loaded with m_val. - gpa_mask is initialized with a bytes instead of an unsigned long. - Handle the case where none of the alias registers are used. Lastly I converted the bau over to using the uv_hub_info->m_val which is the correct value. Without this patch, booting a large configuration hits a problem where the upper bits of the gnode affect the pnode and the bau will not operate. Signed-off-by: Robin Holt Acked-by: Jack Steiner Cc: Cliff Whickman Cc: stable@kernel.org LKML-Reference: <20091015224946.396355000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++---- arch/x86/kernel/tlb_uv.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5f5886a6b53..326c25477d3d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -352,14 +352,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { alias.v = uv_read_local_mmr(redir_addrs[i].alias); - if (alias.s.base == 0) { + if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } - BUG(); + *base = *size = 0; } enum map_type {map_wb, map_uc}; @@ -619,12 +619,12 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = m_val; + uv_cpu_hub_info(cpu)->n_val = n_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 503c1f2e8835..f99fb6acfe34 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -843,8 +843,8 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->n_val; - uv_mmask = (1UL << uv_hub_info->n_val) - 1; + uv_nshift = uv_hub_info->m_val; + uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); uv_bau_table_bases = (struct bau_control **) -- cgit v1.2.1 From 93ae5012a79b11e7fc855b52c7ce1e16fe1540b0 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 15 Oct 2009 14:21:14 -0700 Subject: x86: Don't print number of MCE banks for every CPU The MCE initialization code explicitly says it doesn't handle asymmetric configurations where different CPUs support different numbers of MCE banks, and it prints a big warning in that case. Therefore, printing the "mce: CPU supports MCE banks" message into the kernel log for every CPU is pure redundancy that clutters the log significantly for systems with lots of CPUs. Signed-off-by: Roland Dreier LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..721a77ca8115 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1214,7 +1214,8 @@ static int __cpuinit mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!banks) + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { printk(KERN_WARNING -- cgit v1.2.1 From 5e09954a9acc3b435ffe318b95afd3c02fae069f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 16 Oct 2009 12:31:32 +0200 Subject: x86, mce: Fix up MCE naming nomenclature Prefix global/setup routines with "mcheck_" thus differentiating from the internal facilities prefixed with "mce_". Also, prefix the per cpu calls with mcheck_cpu and rename them to reflect the MCE setup hierarchy of calls better. There should be no functionality change resulting from this patch. Signed-off-by: Borislav Petkov Cc: Andi Kleen LKML-Reference: <1255689093-26921-1-git-send-email-borislav.petkov@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 52 ++++++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..4df69a38be57 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -839,7 +839,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); + mcheck_cpu_init(c); #endif select_idle_routine(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0fb9dc50697e..68d968e69b13 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1136,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */ static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static void mce_start_timer(unsigned long data) { struct timer_list *t = &per_cpu(mce_timer, data); int *n; @@ -1220,7 +1220,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1258,7 +1258,7 @@ static int __cpuinit mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1287,7 +1287,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); @@ -1355,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1369,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: @@ -1383,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) } } -static void mce_init_timer(void) +static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); @@ -1394,7 +1394,7 @@ static void mce_init_timer(void) *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mcheck_timer, smp_processor_id()); + setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } @@ -1414,26 +1414,26 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); if (raw_smp_processor_id() == 0) @@ -1665,7 +1665,7 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; @@ -1680,12 +1680,12 @@ static int mce_disable(void) static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1695,8 +1695,8 @@ static int mce_shutdown(struct sys_device *dev) */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(¤t_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(¤t_cpu_data); return 0; } @@ -1706,8 +1706,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(¤t_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1733,7 +1733,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -2042,7 +2042,7 @@ static __init void mce_init_banks(void) } } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -2070,7 +2070,7 @@ static __init int mce_init_device(void) return err; } -device_initcall(mce_init_device); +device_initcall(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. @@ -2118,7 +2118,7 @@ static int fake_panic_set(void *data, u64 val) DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mce_debugfs_init(void) +static int __init mcheck_debugfs_init(void) { struct dentry *dmce, *ffake_panic; @@ -2132,5 +2132,5 @@ static int __init mce_debugfs_init(void) return 0; } -late_initcall(mce_debugfs_init); +late_initcall(mcheck_debugfs_init); #endif -- cgit v1.2.1 From b33a6363649f0ff83ec81597ea7fe7e688f973cb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 16 Oct 2009 12:31:33 +0200 Subject: x86, mce: Add a global MCE init helper Add an early initcall (pre SMP) which sets up global MCE functionality. Signed-off-by: Borislav Petkov Cc: Andi Kleen LKML-Reference: <1255689093-26921-2-git-send-email-borislav.petkov@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68d968e69b13..80801705edd7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1436,8 +1436,6 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - if (raw_smp_processor_id() == 0) - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); } /* @@ -1657,6 +1655,14 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); +static int __init mcheck_init(void) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + + return 0; +} +early_initcall(mcheck_init); + /* * Sysfs support */ -- cgit v1.2.1 From 1d21e6e3ffad2939f9d8179817c6f9bc3b811b68 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Fri, 16 Oct 2009 06:29:20 -0500 Subject: x86, UV: Fix and clean up bau code to use uv_gpa_to_pnode() Create an inline function to extract the pnode from a global physical address and then convert the broadcast assist unit to use the newly created uv_gpa_to_pnode function. The open-coded code was wrong as well - it might explain a few of our unexplained bau hangs. Signed-off-by: Robin Holt Acked-by: Cliff Whickman Cc: linux-mm@kvack.org Cc: Jack Steiner LKML-Reference: <20091016112920.GZ8903@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f99fb6acfe34..1740c85e24bb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,8 +23,6 @@ static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; /* base pnode in this partition */ static int uv_partition_base_pnode __read_mostly; @@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode) BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ - n = pa >> uv_nshift; + n = uv_gpa_to_pnode(pa); m = pa & uv_mmask; uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, @@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) * need the pnode of where the memory was really allocated */ pa = uv_gpa(pqp); - pn = pa >> uv_nshift; + pn = uv_gpa_to_pnode(pa); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | @@ -843,7 +841,6 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); -- cgit v1.2.1 From 72ed7de74e8f0fad0d8e567ae1f987b740accb3f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 26 Oct 2009 11:11:43 +0100 Subject: x86: crash_dump: Fix non-pae kdump kernel memory accesses Non-PAE 32-bit dump kernels may wrap an address around 4G and poke unwanted space. ptes there are 32-bit long, and since pfn << PAGE_SIZE may exceed this limit, high pfn bits are cropped and wrong address mapped by kmap_atomic_pfn in copy_oldmem_page. Don't allow this behavior in non-PAE kdump kernels by checking pfns passed into copy_oldmem_page. In the case of failure, userspace process gets EFAULT. [v2] - fix comments - move ifdefs inside the function Signed-off-by: Jiri Slaby Cc: Vivek Goyal Cc: Eric W. Biederman Cc: Simon Horman Cc: Paul Mundt LKML-Reference: <1256551903-30567-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash_dump_32.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index f7cdb3b457aa..cd97ce18c29d 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -16,6 +16,22 @@ static void *kdump_buf_page; /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). + */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); if (!userbuf) { -- cgit v1.2.1 From 772be899bc022ef2b911c3611b487d417e3269c3 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 20 Oct 2009 12:54:02 +0800 Subject: x86: Make EFI RTC function depend on 32bit again The EFI RTC functions are only available on 32 bit. commit 7bd867df (x86: Move get/set_wallclock to x86_platform_ops) removed the 32bit dependency which leads to boot crashes on 64bit EFI systems. Add the dependency back. Solves: http://bugzilla.kernel.org/show_bug.cgi?id=14466 Tested-by: Matthew Garrett Signed-off-by: Feng Tang LKML-Reference: <20091020125402.028d66d5@feng-desktop> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/efi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index ad5bd988fb79..cdcfb122f256 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -454,8 +454,10 @@ void __init efi_init(void) if (add_efi_memmap) do_add_efi_memmap(); +#ifdef CONFIG_X86_32 x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; +#endif /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; -- cgit v1.2.1 From 6f9b41006af1bc489030f84ee247abc0df1edccd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 27 Oct 2009 11:01:38 +0100 Subject: x86, apic: Clear APIC Timer Initial Count Register on shutdown Commit a98f8fd24fb24fcb9a359553e64dd6aac5cf4279 (x86: apic reset counter on shutdown) set the counter to max to avoid spurious interrupts when the timer is re-enabled. (In theory) you'll still get a spurious interrupt if spending more than 344 seconds with this interrupt disabled and then unmasking it. The right thing to do is to clear the register. This disables the interrupt from happening (at least it does on AMD hardware). Signed-off-by: Andreas Herrmann LKML-Reference: <20091027100138.GB30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dce93d4b0eaf..4c689f45b238 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -444,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0xffffffff); + apic_write(APIC_TMICT, 0); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ -- cgit v1.2.1 From ca0207114f1708b563f510b7781a360ec5b98359 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2009 18:02:26 +0100 Subject: x86/amd-iommu: Un__init function required on shutdown The function iommu_feature_disable is required on system shutdown to disable the IOMMU but it is marked as __init. This may result in a panic if the memory is reused. This patch fixes this bug. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 498c8c79aaa5..1e423b2add15 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -240,7 +240,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; -- cgit v1.2.1 From f16250669d78a32bdfb27cec4d791e85141e11e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:13 +0900 Subject: percpu: make percpu symbols in cpufreq unique This patch updates percpu related symbols in cpufreq such that percpu symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * drivers/cpufreq/cpufreq.c: s/policy_cpu/cpufreq_policy_cpu/ * drivers/cpufreq/freq_table.c: s/show_table/cpufreq_show_table/ * arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c: s/drv_data/acfreq_data/ s/old_perf/acfreq_old_perf/ Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch. Signed-off-by: Tejun Heo Cc: Rusty Russell --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7d5c3b0ea8da..43eb3465dda7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -68,9 +68,9 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; }; -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); -static DEFINE_PER_CPU(struct aperfmperf, old_perf); +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -214,14 +214,14 @@ static u32 get_cur_val(const struct cpumask *mask) if (unlikely(cpumask_empty(mask))) return 0; - switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { + switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; + perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; @@ -268,8 +268,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) return 0; - ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); - per_cpu(old_perf, cpu) = perf; + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; @@ -278,7 +278,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy, static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -322,7 +322,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); struct acpi_processor_performance *perf; struct cpufreq_freqs freqs; struct drv_cmd cmd; @@ -416,7 +416,7 @@ out: static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_verify\n"); @@ -563,7 +563,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(drv_data, cpu) = data; + per_cpu(acfreq_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -714,20 +714,20 @@ err_unreg: acpi_processor_unregister_performance(perf, cpu); err_free: kfree(data); - per_cpu(drv_data, cpu) = NULL; + per_cpu(acfreq_data, cpu) = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(drv_data, policy->cpu) = NULL; + per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); kfree(data); @@ -738,7 +738,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); dprintk("acpi_cpufreq_resume\n"); -- cgit v1.2.1 From 0fe1e009541e925adc1748a605d8b66188e4b2ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Oct 2009 22:34:14 +0900 Subject: percpu: make percpu symbols in x86 unique This patch updates percpu related symbols in x86 such that percpu symbols are unique and don't clash with local symbols. This serves two purposes of decreasing the possibility of global percpu symbol collision and allowing dropping per_cpu__ prefix from percpu symbols. * arch/x86/kernel/cpu/common.c: rename local variable to avoid collision * arch/x86/kvm/svm.c: s/svm_data/sd/ for local variables to avoid collision * arch/x86/kernel/cpu/cpu_debug.c: s/cpu_arr/cpud_arr/ s/priv_arr/cpud_priv_arr/ s/cpu_priv_count/cpud_priv_count/ * arch/x86/kernel/cpu/intel_cacheinfo.c: s/cpuid4_info/ici_cpuid4_info/ s/cache_kobject/ici_cache_kobject/ s/index_kobject/ici_index_kobject/ * arch/x86/kernel/ds.c: s/cpu_context/cpu_ds_context/ Partly based on Rusty Russell's "alloc_percpu: rename percpu vars which cause name clashes" patch. Signed-off-by: Tejun Heo Acked-by: (kvm) Avi Kivity Cc: Rusty Russell Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Marcelo Tosatti Cc: x86@kernel.org --- arch/x86/kernel/cpu/common.c | 8 +++--- arch/x86/kernel/cpu/cpu_debug.c | 30 +++++++++---------- arch/x86/kernel/cpu/intel_cacheinfo.c | 54 +++++++++++++++++------------------ arch/x86/kernel/ds.c | 4 +-- 4 files changed, 48 insertions(+), 48 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..3192f22f2fdd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1093,7 +1093,7 @@ static void clear_all_debug_regs(void) void __cpuinit cpu_init(void) { - struct orig_ist *orig_ist; + struct orig_ist *oist; struct task_struct *me; struct tss_struct *t; unsigned long v; @@ -1102,7 +1102,7 @@ void __cpuinit cpu_init(void) cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); - orig_ist = &per_cpu(orig_ist, cpu); + oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (cpu != 0 && percpu_read(node_number) == 0 && @@ -1143,12 +1143,12 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!orig_ist->ist[0]) { + if (!oist->ist[0]) { char *estacks = per_cpu(exception_stacks, cpu); for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = + oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index dca325c03999..b368cd862997 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,9 +30,9 @@ #include #include -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); -static DEFINE_PER_CPU(int, cpu_priv_count); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); +static DEFINE_PER_CPU(int, cpud_priv_count); static DEFINE_MUTEX(cpu_debug_lock); @@ -531,7 +531,7 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, /* Already intialized */ if (file == CPU_INDEX_BIT) - if (per_cpu(cpu_arr[type].init, cpu)) + if (per_cpu(cpud_arr[type].init, cpu)) return 0; priv = kzalloc(sizeof(*priv), GFP_KERNEL); @@ -543,8 +543,8 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, priv->reg = reg; priv->file = file; mutex_lock(&cpu_debug_lock); - per_cpu(priv_arr[type], cpu) = priv; - per_cpu(cpu_priv_count, cpu)++; + per_cpu(cpud_priv_arr[type], cpu) = priv; + per_cpu(cpud_priv_count, cpu)++; mutex_unlock(&cpu_debug_lock); if (file) @@ -552,10 +552,10 @@ static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, dentry, (void *)priv, &cpu_fops); else { debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpu_arr[type].dentry, cpu), + per_cpu(cpud_arr[type].dentry, cpu), (void *)priv, &cpu_fops); mutex_lock(&cpu_debug_lock); - per_cpu(cpu_arr[type].init, cpu) = 1; + per_cpu(cpud_arr[type].init, cpu) = 1; mutex_unlock(&cpu_debug_lock); } @@ -615,7 +615,7 @@ static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) if (!is_typeflag_valid(cpu, cpu_base[type].flag)) continue; cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry; + per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; if (type < CPU_TSS_BIT) err = cpu_init_msr(cpu, type, cpu_dentry); @@ -647,11 +647,11 @@ static int cpu_init_cpu(void) err = cpu_init_allreg(cpu, cpu_dentry); pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu)); - if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) { + cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); + if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES; + per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); + per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; err = -ENFILE; } if (err) @@ -676,8 +676,8 @@ static void __exit cpu_debug_exit(void) debugfs_remove_recursive(cpu_debugfs_dir); for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++) - kfree(per_cpu(priv_arr[i], cpu)); + for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) + kfree(per_cpu(cpud_priv_arr[i], cpu)); } module_init(cpu_debug_init); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..f5ccb4fa5a5d 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -512,8 +512,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_SYSFS /* pointer to _cpuid4_info array (for each cache leaf) */ -static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) @@ -526,7 +526,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { struct cpuinfo_x86 *d; for_each_online_cpu(i) { - if (!per_cpu(cpuid4_info, i)) + if (!per_cpu(ici_cpuid4_info, i)) continue; d = &cpu_data(i); this_leaf = CPUID4_INFO_IDX(i, index); @@ -548,7 +548,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) c->apicid >> index_msb) { cpumask_set_cpu(i, to_cpumask(this_leaf->shared_cpu_map)); - if (i != cpu && per_cpu(cpuid4_info, i)) { + if (i != cpu && per_cpu(ici_cpuid4_info, i)) { sibling_leaf = CPUID4_INFO_IDX(i, index); cpumask_set_cpu(cpu, to_cpumask( @@ -587,8 +587,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(cpuid4_info, cpu)); - per_cpu(cpuid4_info, cpu) = NULL; + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; } static int @@ -627,15 +627,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) if (num_cache_leaves == 0) return -ENOENT; - per_cpu(cpuid4_info, cpu) = kzalloc( + per_cpu(ici_cpuid4_info, cpu) = kzalloc( sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(cpuid4_info, cpu) == NULL) + if (per_cpu(ici_cpuid4_info, cpu) == NULL) return -ENOMEM; smp_call_function_single(cpu, get_cpu_leaves, &retval, true); if (retval) { - kfree(per_cpu(cpuid4_info, cpu)); - per_cpu(cpuid4_info, cpu) = NULL; + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; } return retval; @@ -647,7 +647,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ /* pointer to kobject for cpuX/cache */ -static DEFINE_PER_CPU(struct kobject *, cache_kobject); +static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); struct _index_kobject { struct kobject kobj; @@ -656,8 +656,8 @@ struct _index_kobject { }; /* pointer to array of kobjects for cpuX/cache/indexY */ -static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) +static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ static ssize_t show_##file_name \ @@ -876,10 +876,10 @@ static struct kobj_type ktype_percpu_entry = { static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) { - kfree(per_cpu(cache_kobject, cpu)); - kfree(per_cpu(index_kobject, cpu)); - per_cpu(cache_kobject, cpu) = NULL; - per_cpu(index_kobject, cpu) = NULL; + kfree(per_cpu(ici_cache_kobject, cpu)); + kfree(per_cpu(ici_index_kobject, cpu)); + per_cpu(ici_cache_kobject, cpu) = NULL; + per_cpu(ici_index_kobject, cpu) = NULL; free_cache_attributes(cpu); } @@ -895,14 +895,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) return err; /* Allocate all required memory */ - per_cpu(cache_kobject, cpu) = + per_cpu(ici_cache_kobject, cpu) = kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) + if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) goto err_out; - per_cpu(index_kobject, cpu) = kzalloc( + per_cpu(ici_index_kobject, cpu) = kzalloc( sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); - if (unlikely(per_cpu(index_kobject, cpu) == NULL)) + if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) goto err_out; return 0; @@ -926,7 +926,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) if (unlikely(retval < 0)) return retval; - retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), + retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, &sys_dev->kobj, "%s", "cache"); if (retval < 0) { @@ -940,12 +940,12 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, - per_cpu(cache_kobject, cpu), + per_cpu(ici_cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { for (j = 0; j < i; j++) kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); - kobject_put(per_cpu(cache_kobject, cpu)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; } @@ -953,7 +953,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); - kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); + kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); return 0; } @@ -962,7 +962,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i; - if (per_cpu(cpuid4_info, cpu) == NULL) + if (per_cpu(ici_cpuid4_info, cpu) == NULL) return; if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) return; @@ -970,7 +970,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) for (i = 0; i < num_cache_leaves; i++) kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); - kobject_put(per_cpu(cache_kobject, cpu)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ef42a038f1a6..1c47390dd0e5 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -265,13 +265,13 @@ struct ds_context { int cpu; }; -static DEFINE_PER_CPU(struct ds_context *, cpu_context); +static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context); static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; -- cgit v1.2.1 From 16121d70fdf9eeb05ead46b241a293156323dbbe Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 1 Nov 2009 19:27:05 -0500 Subject: x86: Fix printk message typo in mtrr cleanup code Trivial typo. Signed-off-by: Dave Jones LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 315738c74aad..73c86db5acbe 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -846,7 +846,7 @@ int __init mtrr_cleanup(unsigned address_bits) sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", + printk(KERN_INFO "total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { -- cgit v1.2.1 From 05154752cf3767c544b65b5e340793d40b3f1229 Mon Sep 17 00:00:00 2001 From: Gottfried Haider Date: Mon, 2 Nov 2009 11:51:11 +0100 Subject: x86: Add reboot quirk for 3 series Mac mini MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reboot does not work out of the box on my "Early 2009" Mac mini (3,1). Detect this machine via DMI as we do for recent MacBooks. Signed-off-by: Gottfried Haider Cc: Ozan Çağlayan Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a1a3cdda06e1..f93078746e00 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -436,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, { } }; -- cgit v1.2.1 From 7a7732bc0f7c46f217dbec723f25366b6285cc42 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:31 -0800 Subject: x86: Unify fixup_irqs() for 32-bit and 64-bit kernels There is no reason to have different fixup_irqs() for 32-bit and 64-bit kernels. Unify by using the superior 64-bit version for both the kernels. Signed-off-by: Suresh Siddha Signed-off-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230001.562512739@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_32.c | 45 ------------------------------------ arch/x86/kernel/irq_64.c | 58 ----------------------------------------------- 3 files changed, 59 insertions(+), 103 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..3ea66556e5e1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -276,3 +276,62 @@ void smp_generic_interrupt(struct pt_regs *regs) } EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +} +#endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 7d35d0fe2329..10709f29d166 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } - -#ifdef CONFIG_HOTPLUG_CPU - -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - affinity = desc->affinity; - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - printk("Breaking affinity for irq %i\n", irq); - affinity = cpu_all_mask; - } - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (desc->action) - printk_once("Cannot set affinity for irq %i\n", irq); - } - -#if 0 - barrier(); - /* Ingo Molnar says: "after the IO-APIC masks have been redirected - [note the nop - the interrupt-enable boundary on x86 is two - instructions from sti] - to flush out pending hardirqs and - IPIs. After this point nothing is supposed to reach this CPU." */ - __asm__ __volatile__("sti; nop; cli"); - barrier(); -#else - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -#endif -} -#endif - diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 977d8b43a0dd..acf8fbf8fbda 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) return true; } -#ifdef CONFIG_HOTPLUG_CPU -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - static int warned; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); - - affinity = desc->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); - continue; - } - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_all_mask; - } - - if (desc->chip->mask) - desc->chip->mask(irq); - - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (!(warned++)) - set_affinity = 0; - - if (desc->chip->unmask) - desc->chip->unmask(irq); - - spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif extern void call_softirq(void); -- cgit v1.2.1 From 84e21493a3b28c9fefe99fe827fc0c0c101a813d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:32 -0800 Subject: x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping In the presence of interrupt-remapping, irqs will be migrated in the process context and we don't do (and there is no need to) irq_chip mask/unmask while migrating the interrupt. Similarly fix the fixup_irqs() that get called during cpu offline and avoid calling irq_chip mask/unmask for irqs that are ok to be migrated in the process context. While we didn't observe any race condition with the existing code, this change takes complete advantage of interrupt-remapping in the newer generation platforms and avoids any potential HW lockup's (that often worry Eric :) Signed-off-by: Suresh Siddha Acked-by: Eric W. Biederman Cc: garyhade@us.ibm.com LKML-Reference: <20091026230001.661423939@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3ea66556e5e1..342bcbca19b4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -310,7 +310,7 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (desc->chip->mask) + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) desc->chip->mask(irq); if (desc->chip->set_affinity) @@ -318,7 +318,7 @@ void fixup_irqs(void) else if (!(warned++)) set_affinity = 0; - if (desc->chip->unmask) + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) desc->chip->unmask(irq); spin_unlock(&desc->lock); -- cgit v1.2.1 From 23359a88e7eca3c4f402562b102f23014db3c2aa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:33 -0800 Subject: x86: Remove move_cleanup_count from irq_cfg move_cleanup_count for each irq in irq_cfg is keeping track of the total number of cpus that need to free the corresponding vectors associated with the irq which has now been migrated to new destination. As long as this move_cleanup_count is non-zero (i.e., as long as we have n't freed the vector allocations on the old destinations) we were preventing the irq's further migration. This cleanup count is unnecessary and it is enough to not allow the irq migration till we send the cleanup vector to the previous irq destination, for which we already have irq_cfg's move_in_progress. All we need to make sure is that we free the vector at the old desintation but we don't need to wait till that gets freed. Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230001.752968906@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ce16b65cfdcc..e9e5b02c3af2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1161,7 +1161,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -2234,14 +2234,10 @@ void send_cleanup_vector(struct irq_cfg *cfg) if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2430,8 +2426,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2449,7 +2443,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: spin_unlock(&desc->lock); } -- cgit v1.2.1 From a5e74b841930bec78a4684ab9f208b2ddfe7c736 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:34 -0800 Subject: x86: Force irq complete move during cpu offline When a cpu goes offline, fixup_irqs() try to move irq's currently destined to the offline cpu to a new cpu. But this attempt will fail if the irq is recently moved to this cpu and the irq still hasn't arrived at this cpu (for non intr-remapping platforms this is when we free the vector allocation at the previous destination) that is about to go offline. This will endup with the interrupt subsystem still pointing the irq to the offline cpu, causing that irq to not work any more. Fix this by forcing the irq to complete its move (its been a long time we moved the irq to this cpu which we are offlining now) and then move this irq to a new cpu before this cpu goes offline. Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230001.848830905@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 18 +++++++++++++++--- arch/x86/kernel/irq.c | 7 +++++++ 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e9e5b02c3af2..4e886efd9a15 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2450,21 +2450,33 @@ unlock: irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 342bcbca19b4..b10a5e1da06c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -305,6 +305,13 @@ void fixup_irqs(void) continue; } + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. + */ + irq_force_complete_move(irq); + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; affinity = cpu_all_mask; -- cgit v1.2.1 From b3ec0a37a7907813bb4fb85a2d94102c152470b7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:35 -0800 Subject: x86: Use EOI register in io-apic on intel platforms IO-APIC's in intel chipsets support EOI register starting from IO-APIC version 2. Use that when ever we need to clear the IO-APIC RTE's RemoteIRR bit explicitly. Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230001.947855317@sbs-t61.sc.intel.com> [ Marked use_eio_reg as __read_mostly, fixed small details ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 81 ++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4e886efd9a15..31e9db3c12ad 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2492,6 +2492,51 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; +static int use_eoi_reg __read_mostly; + +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static int ioapic_supports_eoi(void) +{ + struct pci_dev *root; + + root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0)); + if (root && root->vendor == PCI_VENDOR_ID_INTEL && + mp_ioapics[0].apicver >= 0x2) { + use_eoi_reg = 1; + printk(KERN_INFO "IO-APIC supports EOI register\n"); + } else + printk(KERN_INFO "IO-APIC doesn't support EOI\n"); + + return 0; +} + +fs_initcall(ioapic_supports_eoi); + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2575,37 +2620,19 @@ static void ack_apic_level(unsigned int irq) /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); + + if (use_eoi_reg) + eoi_ioapic_irq(desc); + else { + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); + spin_unlock(&ioapic_lock); + } } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); -- cgit v1.2.1 From 5231a68614b94f60e8f6c56bc6e3d75955b9e75e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Oct 2009 14:24:36 -0800 Subject: x86: Remove local_irq_enable()/local_irq_disable() in fixup_irqs() To ensure that we handle all the pending interrupts (destined for this cpu that is going down) in the interrupt subsystem before the cpu goes offline, fixup_irqs() does: local_irq_enable(); mdelay(1); local_irq_disable(); Enabling interrupts is not a good thing as this cpu is already offline. So this patch replaces that logic with, mdelay(1); check APIC_IRR bits Retrigger the irq at the new destination if any interrupt has arrived via IPI. For IO-APIC level triggered interrupts, this retrigger IPI will appear as an edge interrupt. ack_apic_level() will detect this condition and IO-APIC RTE's remoteIRR is cleared using directed EOI(using IO-APIC EOI register) on Intel platforms and for others it uses the existing mask+edge logic followed by unmask+level. We can also remove mdelay() and then send spuriuous interrupts to new cpu targets for all the irqs that were handled previously by this cpu that is going offline. While it works, I have seen spurious interrupt messages (nothing wrong but still annoying messages during cpu offline, which can be seen during suspend/resume etc) Signed-off-by: Suresh Siddha Acked-by: Gary Hade Cc: Eric W. Biederman LKML-Reference: <20091026230002.043281924@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b10a5e1da06c..8a82728d47c1 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -281,7 +281,7 @@ EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irq; + unsigned int irq, vector; static int warned; struct irq_desc *desc; @@ -336,9 +336,33 @@ void fixup_irqs(void) printk("Cannot set affinity for irq %i\n", irq); } - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ mdelay(1); - local_irq_disable(); + + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irr; + + if (__get_cpu_var(vector_irq)[vector] < 0) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + spin_unlock(&desc->lock); + } + } } #endif -- cgit v1.2.1 From a489ca355efaf9efa4990b0f8f30ab650a206273 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 2 Nov 2009 16:59:15 -0800 Subject: x86: Make sure we also print a Code: line for show_regs() show_regs() is called as a mini BUG() equivalent in some places, specifically for the "scheduling while atomic" case. Unfortunately right now it does not print a Code: line unlike a real bug/oops. This patch changes the x86 implementation of show_regs() so that it calls the same function as oopses do to print the registers as well as the Code: line. Signed-off-by: Arjan van de Ven LKML-Reference: <20091102165915.4a980fc0@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4cf79567cdab..e658331edb6d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -187,7 +187,7 @@ void __show_regs(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, ®s->sp, regs->bp); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..2386999bfcd2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -226,8 +226,7 @@ void __show_regs(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs, 1); + show_registers(regs); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -- cgit v1.2.1 From 41a48d14f6991020c9bb6b93e289ca5b411ed09a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 5 Oct 2009 19:23:06 +0900 Subject: x86/hw-breakpoints: Actually flush thread breakpoints in flush_thread(). flush_thread() tries to do a TIF_DEBUG check before calling in to flush_thread_hw_breakpoint() (which subsequently clears the thread flag), but for some reason, the x86 code is manually clearing TIF_DEBUG immediately before the test, so this path will never be taken. This kills off the erroneous clear_tsk_thread_flag() and lets flush_thread_hw_breakpoint() actually get invoked. Presumably folks were getting lucky with testing and the free_thread_info() -> free_thread_xstate() path was taking care of the flush there. Signed-off-by: Paul Mundt Acked-by: "K.Prasad" Cc: Ingo Molnar Cc: Alan Stern LKML-Reference: <20091005102306.GA7889@linux-sh.org> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/process.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2275ce5776de..cf8ee0016307 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -107,8 +107,6 @@ void flush_thread(void) } #endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) flush_thread_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -- cgit v1.2.1 From 97829de5a3b88899c5f3ac8802d11868bf4180ba Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 3 Nov 2009 14:02:05 -0500 Subject: x86, 64-bit: Fix bstep_iret jump This jump should be unconditional. Signed-off-by: Brian Gerst LKML-Reference: <1257274925-15713-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index af0f4b226dbe..1579a6c59cfd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1501,7 +1501,7 @@ error_kernelspace: bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) - je error_swapgs + jmp error_swapgs END(error_entry) -- cgit v1.2.1 From ce7c42710e2dd133f10b7fc9ed9c73bdd2435f7a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:53:52 +1030 Subject: cpumask: Avoid cpumask_t in arch/x86/kernel/apic/nmi.c Ingo wants the certainty of a static cpumask (rather than a cpumask_var_t), but cpumask_t will some day be undefined to avoid on-stack declarations. This is what DECLARE_BITMAP/to_cpumask() is for. Signed-off-by: Rusty Russell LKML-Reference: <200911031453.52394.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 7ff61d6a188a..6389432a9dbf 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,8 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_t backtrace_mask __read_mostly; +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, &backtrace_mask)) { + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); @@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, &backtrace_mask); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; } @@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(&backtrace_mask, cpu_online_mask); + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); apic->send_IPI_all(NMI_VECTOR); /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(&backtrace_mask)) + if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); } -- cgit v1.2.1 From 6ac5c5310ca9d7dd3d7e677c2715b1f06a348330 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:58:30 +1030 Subject: cpumask: Use modern cpumask style in arch/x86/kernel/cpu/mcheck/mce-inject.c Note that there's no freeing the cpu var, since this module has no unload function. Signed-off-by: Rusty Russell Cc: Andi Kleen Cc: Huang Ying LKML-Reference: <200911031458.30987.rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 472763d92098..73734baa50f2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) m->finished = 0; } -static cpumask_t mce_inject_cpumask; +static cpumask_var_t mce_inject_cpumask; static int mce_raise_notify(struct notifier_block *self, unsigned long val, void *data) @@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) raise_exception(m, args->regs); else if (m->status) @@ -148,22 +148,22 @@ static void raise_mce(struct mce *m) unsigned long start; int cpu; get_online_cpus(); - mce_inject_cpumask = cpu_online_map; - cpu_clear(get_cpu(), mce_inject_cpumask); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); for_each_online_cpu(cpu) { struct mce *mcpu = &per_cpu(injectm, cpu); if (!mcpu->finished || MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpus_empty(mce_inject_cpumask)) - apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) + apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); start = jiffies; - while (!cpus_empty(mce_inject_cpumask)) { + while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR "Timeout waiting for mce inject NMI %lx\n", - *cpus_addr(mce_inject_cpumask)); + *cpumask_bits(mce_inject_cpumask)); break; } cpu_relax(); @@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, static int inject_init(void) { + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; register_die_notifier(&mce_raise_nb); -- cgit v1.2.1 From 89240ba059ca468ae7a8346edf7f95082458c2fc Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 3 Nov 2009 10:22:40 +0100 Subject: x86, fs: Fix x86 procfs stack information for threads on 64-bit This patch fixes two issues in the procfs stack information on x86-64 linux. The 32 bit loader compat_do_execve did not store stack start. (this was figured out by Alexey Dobriyan). The stack information on a x64_64 kernel always shows 0 kbyte stack usage, because of a missing implementation of the KSTK_ESP macro which always returned -1. The new implementation now returns the right value. Signed-off-by: Stefani Seibold Cc: Americo Wang Cc: Alexey Dobriyan Cc: Al Viro Cc: Andrew Morton LKML-Reference: <1257240160.4889.24.camel@wall-e> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..eb62cbcaa490 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -664,3 +664,8 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } +unsigned long KSTK_ESP(struct task_struct *task) +{ + return (test_tsk_thread_flag(task, TIF_IA32)) ? + (task_pt_regs(task)->sp) : ((task)->thread.usersp); +} -- cgit v1.2.1 From f1b291d4c47440cbfc1a478e88800e2742d60a80 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Fri, 6 Nov 2009 15:44:04 +0100 Subject: x86: Add Phoenix/MSC BIOSes to lowmem corruption list We have a board with a Phoenix/MSC BIOS which also corrupts the low 64KB of RAM, so add an entry to the table. Signed-off-by: Simon Kagstrom LKML-Reference: <20091106154404.002648d9@marrow.netinsight.se> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..2a34f9c5be21 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -659,6 +659,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix/MSC BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), + }, + }, { /* * AMI BIOS with low memory corruption was found on Intel DG45ID board. -- cgit v1.2.1 From 338bac527ed0e35b4cb50390972f15d3cbce92ca Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 27 Oct 2009 16:34:44 +0900 Subject: x86: Use x86_platform for iommu_shutdown This patch cleans up pci_iommu_shutdown() a bit to use x86_platform (similar to how IA64 initializes an IOMMU driver). This adds iommu_shutdown() to x86_platform to avoid calling every IOMMUs' shutdown functions in pci_iommu_shutdown() in order. The IOMMU shutdown functions are platform specific (we don't have multiple different IOMMU hardware) so the current way is pointless. An IOMMU driver sets x86_platform.iommu_shutdown to the shutdown function if necessary. Signed-off-by: FUJITA Tomonori Cc: joerg.roedel@amd.com LKML-Reference: <20091027163358F.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 6 +----- arch/x86/kernel/crash.c | 5 ++--- arch/x86/kernel/pci-dma.c | 7 ------- arch/x86/kernel/pci-gart_64.c | 6 ++++-- arch/x86/kernel/reboot.c | 4 ++-- arch/x86/kernel/x86_init.c | 2 ++ 6 files changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c20001e4f556..6acd43e9afd7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1297,6 +1297,7 @@ int __init amd_iommu_init(void) else printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); + x86_platform.iommu_shutdown = disable_iommus; out: return ret; @@ -1323,11 +1324,6 @@ free: goto out; } -void amd_iommu_shutdown(void) -{ - disable_iommus(); -} - /**************************************************************************** * * Early detect code. This code runs at IOMMU detection time in the DMA diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 5e409dc298a4..a4849c10a77e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,8 +27,7 @@ #include #include #include -#include - +#include #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif crash_save_cpu(regs, safe_smp_processor_id()); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b2a71dca5642..ce2fb91bbed1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -303,13 +303,6 @@ static int __init pci_iommu_init(void) no_iommu_init(); return 0; } - -void pci_iommu_shutdown(void) -{ - gart_iommu_shutdown(); - - amd_iommu_shutdown(); -} /* Must execute after PCI subsystem */ rootfs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a7f1b64f86e0..a9bcdf7c8801 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,6 +39,7 @@ #include #include #include +#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -688,12 +689,12 @@ static struct dma_map_ops gart_dma_ops = { .free_coherent = gart_free_coherent, }; -void gart_iommu_shutdown(void) +static void gart_iommu_shutdown(void) { struct pci_dev *dev; int i; - if (no_agp && (dma_ops != &gart_dma_ops)) + if (no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { @@ -838,6 +839,7 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; } void __init gart_parse_options(char *p) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f93078746e00..2b97fc5b124e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -23,7 +23,7 @@ # include # include #else -# include +# include #endif /* @@ -622,7 +622,7 @@ void native_machine_shutdown(void) #endif #ifdef CONFIG_X86_64 - pci_iommu_shutdown(); + x86_platform.iommu_shutdown(); #endif } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4449a4a2c2ed..bc9b230ef402 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,6 +14,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -72,4 +73,5 @@ struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, }; -- cgit v1.2.1 From 2ae8bb75db1f3de422eb5898f2a063c46c36dba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Oct 2009 15:41:46 +0100 Subject: x86: Fix iommu=nodac parameter handling iommu=nodac should forbid dac instead of enabling it. Fix it. Signed-off-by: Tejun Heo Acked-by: FUJITA Tomonori Cc: Matteo Frigo Cc: # .32.x and older LKML-Reference: <4AE5B52A.4050408@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ce2fb91bbed1..839d49a669bc 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -216,7 +216,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; -- cgit v1.2.1 From 24f1e32c60c45c89a997c73395b69c8af6f0a84e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Sep 2009 19:22:48 +0200 Subject: hw-breakpoints: Rewrite the hw-breakpoints layer on top of perf events This patch rebase the implementation of the breakpoints API on top of perf events instances. Each breakpoints are now perf events that handle the register scheduling, thread/cpu attachment, etc.. The new layering is now made as follows: ptrace kgdb ftrace perf syscall \ | / / \ | / / / Core breakpoint API / / | / | / Breakpoints perf events | | Breakpoints PMU ---- Debug Register constraints handling (Part of core breakpoint API) | | Hardware debug registers Reasons of this rewrite: - Use the centralized/optimized pmu registers scheduling, implying an easier arch integration - More powerful register handling: perf attributes (pinned/flexible events, exclusive/non-exclusive, tunable period, etc...) Impact: - New perf ABI: the hardware breakpoints counters - Ptrace breakpoints setting remains tricky and still needs some per thread breakpoints references. Todo (in the order): - Support breakpoints perf counter events for perf tools (ie: implement perf_bpcounter_event()) - Support from perf tools Changes in v2: - Follow the perf "event " rename - The ptrace regression have been fixed (ptrace breakpoint perf events weren't released when a task ended) - Drop the struct hw_breakpoint and store generic fields in perf_event_attr. - Separate core and arch specific headers, drop asm-generic/hw_breakpoint.h and create linux/hw_breakpoint.h - Use new generic len/type for breakpoint - Handle off case: when breakpoints api is not supported by an arch Changes in v3: - Fix broken CONFIG_KVM, we need to propagate the breakpoint api changes to kvm when we exit the guest and restore the bp registers to the host. Changes in v4: - Drop the hw_breakpoint_restore() stub as it is only used by KVM - EXPORT_SYMBOL_GPL hw_breakpoint_restore() as KVM can be built as a module - Restore the breakpoints unconditionally on kvm guest exit: TIF_DEBUG_THREAD doesn't anymore cover every cases of running breakpoints and vcpu->arch.switch_db_regs might not always be set when the guest used debug registers. (Waiting for a reliable optimization) Changes in v5: - Split-up the asm-generic/hw-breakpoint.h moving to linux/hw_breakpoint.h into a separate patch - Optimize the breakpoints restoring while switching from kvm guest to host. We only want to restore the state if we have active breakpoints to the host, otherwise we don't care about messed-up address registers. - Add asm/hw_breakpoint.h to Kbuild - Fix bad breakpoint type in trace_selftest.c Changes in v6: - Fix wrong header inclusion in trace.h (triggered a build error with CONFIG_FTRACE_SELFTEST Signed-off-by: Frederic Weisbecker Cc: Prasad Cc: Alan Stern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Ingo Molnar Cc: Jan Kiszka Cc: Jiri Slaby Cc: Li Zefan Cc: Avi Kivity Cc: Paul Mackerras Cc: Mike Galbraith Cc: Masami Hiramatsu Cc: Paul Mundt --- arch/x86/kernel/hw_breakpoint.c | 391 ++++++++++++++++++++++++++-------------- arch/x86/kernel/process.c | 7 +- arch/x86/kernel/process_32.c | 26 +-- arch/x86/kernel/process_64.c | 26 +-- arch/x86/kernel/ptrace.c | 182 +++++++++++++------ arch/x86/kernel/smpboot.c | 3 - 6 files changed, 385 insertions(+), 250 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 9316a9de4de3..e622620790bd 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ * * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker */ /* @@ -22,6 +23,8 @@ * using the CPU's debug registers. */ +#include +#include #include #include #include @@ -38,26 +41,24 @@ #include #include -/* Unmasked kernel DR7 value */ -static unsigned long kdr7; +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); /* - * Masks for the bits corresponding to registers DR0 - DR3 in DR7 register. - * Used to clear and verify the status of bits corresponding to DR0 - DR3 + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus */ -static const unsigned long dr7_masks[HBP_NUM] = { - 0x000f0003, /* LEN0, R/W0, G0, L0 */ - 0x00f0000c, /* LEN1, R/W1, G1, L1 */ - 0x0f000030, /* LEN2, R/W2, G2, L2 */ - 0xf00000c0 /* LEN3, R/W3, G3, L3 */ -}; +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); /* * Encode the length, type, Exact, and Enable bits for a particular breakpoint * as stored in debug register 7. */ -static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; @@ -68,64 +69,89 @@ static unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) return bp_info; } -void arch_update_kernel_hw_breakpoint(void *unused) +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) { - struct hw_breakpoint *bp; - int i, cpu = get_cpu(); - unsigned long temp_kdr7 = 0; - - /* Don't allow debug exceptions while we update the registers */ - set_debugreg(0UL, 7); + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); - for (i = hbp_kernel_pos; i < HBP_NUM; i++) { - per_cpu(this_hbp_kernel[i], cpu) = bp = hbp_kernel[i]; - if (bp) { - temp_kdr7 |= encode_dr7(i, bp->info.len, bp->info.type); - set_debugreg(bp->info.address, i); - } - } + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; - /* No need to set DR6. Update the debug registers with kernel-space - * breakpoint values from kdr7 and user-space requests from the - * current process - */ - kdr7 = temp_kdr7; - set_debugreg(kdr7 | current->thread.debugreg7, 7); - put_cpu(); + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; } /* - * Install the thread breakpoints in their debug registers. + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. */ -void arch_install_thread_hw_breakpoint(struct task_struct *tsk) +int arch_install_hw_breakpoint(struct perf_event *bp) { - struct thread_struct *thread = &(tsk->thread); - - switch (hbp_kernel_pos) { - case 4: - set_debugreg(thread->debugreg[3], 3); - case 3: - set_debugreg(thread->debugreg[2], 2); - case 2: - set_debugreg(thread->debugreg[1], 1); - case 1: - set_debugreg(thread->debugreg[0], 0); - default: - break; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } } - /* No need to set DR6 */ - set_debugreg((kdr7 | thread->debugreg7), 7); + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __get_cpu_var(cpu_debugreg[i]) = info->address; + + dr7 = &__get_cpu_var(dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; } /* - * Install the debug register values for just the kernel, no thread. + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. */ -void arch_uninstall_thread_hw_breakpoint(void) +void arch_uninstall_hw_breakpoint(struct perf_event *bp) { - /* Clear the user-space portion of debugreg7 by setting only kdr7 */ - set_debugreg(kdr7, 7); + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + dr7 = &__get_cpu_var(dr7); + *dr7 &= ~encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); } static int get_hbp_len(u8 hbp_len) @@ -133,17 +159,17 @@ static int get_hbp_len(u8 hbp_len) unsigned int len_in_bytes = 0; switch (hbp_len) { - case HW_BREAKPOINT_LEN_1: + case X86_BREAKPOINT_LEN_1: len_in_bytes = 1; break; - case HW_BREAKPOINT_LEN_2: + case X86_BREAKPOINT_LEN_2: len_in_bytes = 2; break; - case HW_BREAKPOINT_LEN_4: + case X86_BREAKPOINT_LEN_4: len_in_bytes = 4; break; #ifdef CONFIG_X86_64 - case HW_BREAKPOINT_LEN_8: + case X86_BREAKPOINT_LEN_8: len_in_bytes = 8; break; #endif @@ -178,67 +204,146 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) /* * Store a breakpoint's encoded address, length, and type. */ -static int arch_store_info(struct hw_breakpoint *bp, struct task_struct *tsk) +static int arch_store_info(struct perf_event *bp) { - /* - * User-space requests will always have the address field populated - * Symbol names from user-space are rejected - */ - if (tsk && bp->info.name) - return -EINVAL; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); /* * For kernel-addresses, either the address or symbol name can be * specified. */ - if (bp->info.name) - bp->info.address = (unsigned long) - kallsyms_lookup_name(bp->info.name); - if (bp->info.address) + if (info->name) + info->address = (unsigned long) + kallsyms_lookup_name(info->name); + if (info->address) return 0; + return -EINVAL; } -/* - * Validate the arch-specific HW Breakpoint register settings - */ -int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, - struct task_struct *tsk) +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) { - unsigned int align; - int ret = -EINVAL; + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } - switch (bp->info.type) { - /* - * Ptrace-refactoring code - * For now, we'll allow instruction breakpoint only for user-space - * addresses - */ - case HW_BREAKPOINT_EXECUTE: - if ((!arch_check_va_in_userspace(bp->info.address, - bp->info.len)) && - bp->info.len != HW_BREAKPOINT_LEN_EXECUTE) - return ret; + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + *gen_type = HW_BREAKPOINT_X; break; - case HW_BREAKPOINT_WRITE: + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; break; - case HW_BREAKPOINT_RW: + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; break; default: - return ret; + return -EINVAL; } - switch (bp->info.len) { + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Len */ + switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: - align = 0; + info->len = X86_BREAKPOINT_LEN_1; break; case HW_BREAKPOINT_LEN_2: - align = 1; + info->len = X86_BREAKPOINT_LEN_2; break; case HW_BREAKPOINT_LEN_4: - align = 3; + info->len = X86_BREAKPOINT_LEN_4; break; #ifdef CONFIG_X86_64 case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + break; + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp, + struct task_struct *tsk) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + if (info->type == X86_BREAKPOINT_EXECUTE) + /* + * Ptrace-refactoring code + * For now, we'll allow instruction breakpoint only for user-space + * addresses + */ + if ((!arch_check_va_in_userspace(info->address, info->len)) && + info->len != X86_BREAKPOINT_EXECUTE) + return ret; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: align = 7; break; #endif @@ -246,8 +351,8 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, return ret; } - if (bp->triggered) - ret = arch_store_info(bp, tsk); + if (bp->callback) + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -255,44 +360,47 @@ int arch_validate_hwbkpt_settings(struct hw_breakpoint *bp, * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ - if (bp->info.address & align) + if (info->address & align) return -EINVAL; /* Check that the virtual address is in the proper range */ if (tsk) { - if (!arch_check_va_in_userspace(bp->info.address, bp->info.len)) + if (!arch_check_va_in_userspace(info->address, info->len)) return -EFAULT; } else { - if (!arch_check_va_in_kernelspace(bp->info.address, - bp->info.len)) + if (!arch_check_va_in_kernelspace(info->address, info->len)) return -EFAULT; } + return 0; } -void arch_update_user_hw_breakpoint(int pos, struct task_struct *tsk) +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { - struct thread_struct *thread = &(tsk->thread); - struct hw_breakpoint *bp = thread->hbp[pos]; - - thread->debugreg7 &= ~dr7_masks[pos]; - if (bp) { - thread->debugreg[pos] = bp->info.address; - thread->debugreg7 |= encode_dr7(pos, bp->info.len, - bp->info.type); - } else - thread->debugreg[pos] = 0; + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } } -void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) +#ifdef CONFIG_KVM +void hw_breakpoint_restore(void) { - int i; - struct thread_struct *thread = &(tsk->thread); - - thread->debugreg7 = 0; - for (i = 0; i < HBP_NUM; i++) - thread->debugreg[i] = 0; + set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); + set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); + set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); + set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__get_cpu_var(dr7), 7); } +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); +#endif /* * Handle debug exception notifications. @@ -313,7 +421,7 @@ void arch_flush_thread_hw_breakpoint(struct task_struct *tsk) static int __kprobes hw_breakpoint_handler(struct die_args *args) { int i, cpu, rc = NOTIFY_STOP; - struct hw_breakpoint *bp; + struct perf_event *bp; unsigned long dr7, dr6; unsigned long *dr6_p; @@ -325,10 +433,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; - /* Lazy debug register switching */ - if (!test_tsk_thread_flag(current, TIF_DEBUG)) - arch_uninstall_thread_hw_breakpoint(); - get_debugreg(dr7, 7); /* Disable breakpoints during exception handling */ set_debugreg(0UL, 7); @@ -344,17 +448,18 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) for (i = 0; i < HBP_NUM; ++i) { if (likely(!(dr6 & (DR_TRAP0 << i)))) continue; + /* - * Find the corresponding hw_breakpoint structure and - * invoke its triggered callback. + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. */ - if (i >= hbp_kernel_pos) - bp = per_cpu(this_hbp_kernel[i], cpu); - else { - bp = current->thread.hbp[i]; - if (bp) - rc = NOTIFY_DONE; - } + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + if (bp) + rc = NOTIFY_DONE; /* * Reset the 'i'th TRAP bit in dr6 to denote completion of * exception handling @@ -362,19 +467,23 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) (*dr6_p) &= ~(DR_TRAP0 << i); /* * bp can be NULL due to lazy debug register switching - * or due to the delay between updates of hbp_kernel_pos - * and this_hbp_kernel. + * or due to concurrent perf counter removing. */ - if (!bp) - continue; + if (!bp) { + rcu_read_unlock(); + break; + } + + (bp->callback)(bp, args->regs); - (bp->triggered)(bp, args->regs); + rcu_read_unlock(); } if (dr6 & (~DR_TRAP_BITS)) rc = NOTIFY_DONE; set_debugreg(dr7, 7); put_cpu(); + return rc; } @@ -389,3 +498,13 @@ int __kprobes hw_breakpoint_exceptions_notify( return hw_breakpoint_handler(data); } + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} + +void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index cf8ee0016307..744508e7cfdd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -18,7 +19,6 @@ #include #include #include -#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -47,8 +47,6 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - flush_thread_hw_breakpoint(tsk); WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } @@ -107,8 +105,7 @@ void flush_thread(void) } #endif - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - flush_thread_hw_breakpoint(tsk); + flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 209e74801763..d5bd3132ee70 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -59,7 +59,6 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -264,9 +263,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_ptr = NULL; tsk = current; err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(tsk, TIF_DEBUG))) - if (copy_thread_hw_breakpoint(tsk, p, clone_flags)) - goto out; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -287,13 +285,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); -out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - if (err) - flush_thread_hw_breakpoint(p); clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); p->thread.ds_ctx = NULL; @@ -437,23 +432,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) lazy_load_gs(next->gs); percpu_write(current_task, next_p); - /* - * There's a problem with moving the arch_install_thread_hw_breakpoint() - * call before current is updated. Suppose a kernel breakpoint is - * triggered in between the two, the hw-breakpoint handler will see that - * the 'current' task does not have TIF_DEBUG flag set and will think it - * is leftover from an old task (lazy switching) and will erase it. Then - * until the next context switch, no user-breakpoints will be installed. - * - * The real problem is that it's impossible to update both current and - * physical debug registers at the same instant, so there will always be - * a window in which they disagree and a breakpoint might get triggered. - * Since we use lazy switching, we are forced to assume that a - * disagreement means that current is correct and the exception is due - * to lazy debug register switching. - */ - if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) - arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 72edac026a78..5bafdec34441 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,7 +53,6 @@ #include #include #include -#include asmlinkage extern void ret_from_fork(void); @@ -244,8 +243,6 @@ void release_thread(struct task_struct *dead_task) BUG(); } } - if (unlikely(dead_task->thread.debugreg7)) - flush_thread_hw_breakpoint(dead_task); } static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) @@ -309,9 +306,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, savesegment(ds, p->thread.ds); err = -ENOMEM; - if (unlikely(test_tsk_thread_flag(me, TIF_DEBUG))) - if (copy_thread_hw_breakpoint(me, p, clone_flags)) - goto out; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); @@ -351,8 +346,6 @@ out: kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - if (err) - flush_thread_hw_breakpoint(p); return err; } @@ -508,23 +501,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (preload_fpu) __math_state_restore(); - /* - * There's a problem with moving the arch_install_thread_hw_breakpoint() - * call before current is updated. Suppose a kernel breakpoint is - * triggered in between the two, the hw-breakpoint handler will see that - * the 'current' task does not have TIF_DEBUG flag set and will think it - * is leftover from an old task (lazy switching) and will erase it. Then - * until the next context switch, no user-breakpoints will be installed. - * - * The real problem is that it's impossible to update both current and - * physical debug registers at the same instant, so there will always be - * a window in which they disagree and a breakpoint might get triggered. - * Since we use lazy switching, we are forced to assume that a - * disagreement means that current is correct and the exception is due - * to lazy debug register switching. - */ - if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG))) - arch_install_thread_hw_breakpoint(next_p); return prev_p; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 267cb85b479c..e79610d95971 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include @@ -441,54 +443,59 @@ static int genregs_set(struct task_struct *target, return ret; } -/* - * Decode the length and type bits for a particular breakpoint as - * stored in debug register 7. Return the "enabled" status. - */ -static int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, - unsigned *type) -{ - int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); - - *len = (bp_info & 0xc) | 0x40; - *type = (bp_info & 0x3) | 0x80; - return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; -} - -static void ptrace_triggered(struct hw_breakpoint *bp, struct pt_regs *regs) +static void ptrace_triggered(struct perf_event *bp, void *data) { - struct thread_struct *thread = &(current->thread); int i; + struct thread_struct *thread = &(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ - for (i = 0; i < hbp_kernel_pos; i++) - /* - * We will check bp->info.address against the address stored in - * thread's hbp structure and not debugreg[i]. This is to ensure - * that the corresponding bit for 'i' in DR7 register is enabled - */ - if (bp->info.address == thread->hbp[i]->info.address) + for (i = 0; i < HBP_NUM; i++) { + if (thread->ptrace_bps[i] == bp) break; + } thread->debugreg6 |= (DR_TRAP0 << i); } +/* + * Walk through every ptrace breakpoints for this thread and + * build the dr7 value on top of their attributes. + * + */ +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) +{ + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } + } + + return dr7; +} + /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &(tsk->thread); - unsigned long old_dr7 = thread->debugreg7; + unsigned long old_dr7; int i, orig_ret = 0, rc = 0; int enabled, second_pass = 0; unsigned len, type; - struct hw_breakpoint *bp; + int gen_len, gen_type; + struct perf_event *bp; data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: /* * Loop through all the hardware breakpoints, making the @@ -496,11 +503,12 @@ restore: */ for (i = 0; i < HBP_NUM; i++) { enabled = decode_dr7(data, i, &len, &type); - bp = thread->hbp[i]; + bp = thread->ptrace_bps[i]; if (!enabled) { if (bp) { - /* Don't unregister the breakpoints right-away, + /* + * Don't unregister the breakpoints right-away, * unless all register_user_hw_breakpoint() * requests have succeeded. This prevents * any window of opportunity for debug @@ -508,27 +516,45 @@ restore: */ if (!second_pass) continue; - unregister_user_hw_breakpoint(tsk, bp); - kfree(bp); + thread->ptrace_bps[i] = NULL; + unregister_hw_breakpoint(bp); } continue; } + + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ if (!bp) { - rc = -ENOMEM; - bp = kzalloc(sizeof(struct hw_breakpoint), GFP_KERNEL); - if (bp) { - bp->info.address = thread->debugreg[i]; - bp->triggered = ptrace_triggered; - bp->info.len = len; - bp->info.type = type; - rc = register_user_hw_breakpoint(tsk, bp); - if (rc) - kfree(bp); - } - } else - rc = modify_user_hw_breakpoint(tsk, bp); + rc = -EINVAL; + break; + } + + rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (rc) break; + + /* + * This is a temporary thing as bp is unregistered/registered + * to simulate modification + */ + bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, + gen_type, bp->callback, + tsk, true); + thread->ptrace_bps[i] = NULL; + + if (!bp) { /* incorrect bp, or we have a bug in bp API */ + rc = -EINVAL; + break; + } + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + bp = NULL; + break; + } + thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -553,15 +579,63 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) struct thread_struct *thread = &(tsk->thread); unsigned long val = 0; - if (n < HBP_NUM) - val = thread->debugreg[n]; - else if (n == 6) + if (n < HBP_NUM) { + struct perf_event *bp; + bp = thread->ptrace_bps[n]; + if (!bp) + return 0; + val = bp->hw.info.address; + } else if (n == 6) { val = thread->debugreg6; - else if (n == 7) - val = thread->debugreg7; + } else if (n == 7) { + val = ptrace_get_dr7(thread->ptrace_bps); + } return val; } +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct perf_event *bp; + struct thread_struct *t = &tsk->thread; + + if (!t->ptrace_bps[nr]) { + /* + * Put stub len and type to register (reserve) an inactive but + * correct bp + */ + bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, + HW_BREAKPOINT_W, + ptrace_triggered, tsk, + false); + } else { + bp = t->ptrace_bps[nr]; + t->ptrace_bps[nr] = NULL; + bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, + bp->attr.bp_type, + bp->callback, + tsk, + bp->attr.disabled); + } + + if (!bp) + return -EIO; + /* + * CHECKME: the previous code returned -EIO if the addr wasn't a + * valid task virtual addr. The new one will return -EINVAL in this + * case. + * -EINVAL may be what we want for in-kernel breakpoints users, but + * -EIO looks better for ptrace, since we refuse a register writing + * for the user. And anyway this is the previous behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; + + return 0; +} + /* * Handle PTRACE_POKEUSR calls for the debug register area. */ @@ -575,19 +649,13 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) return -EIO; if (n == 6) { - tsk->thread.debugreg6 = val; + thread->debugreg6 = val; goto ret_path; } if (n < HBP_NUM) { - if (thread->hbp[n]) { - if (arch_check_va_in_userspace(val, - thread->hbp[n]->info.len) == 0) { - rc = -EIO; - goto ret_path; - } - thread->hbp[n]->info.address = val; - } - thread->debugreg[n] = val; + rc = ptrace_set_breakpoint_addr(tsk, n, val); + if (rc) + return rc; } /* All that's left is DR7 */ if (n == 7) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 213a7a3e4562..565ebc65920e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include @@ -328,7 +327,6 @@ notrace static void __cpuinit start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - load_debug_registers(); cpu_idle(); } @@ -1269,7 +1267,6 @@ void cpu_disable_common(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(); - hw_breakpoint_disable(); } int native_cpu_disable(void) -- cgit v1.2.1 From eb647138acefc897c0eb6eddd5d3650966dfe627 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sun, 8 Nov 2009 12:12:14 +0100 Subject: x86/PCI: Adjust GFP mask handling for coherent allocations Rather than forcing GFP flags and DMA mask to be inconsistent, GFP flags should be determined even for the fallback device through dma_alloc_coherent_mask()/dma_alloc_coherent_gfp_flags(). This restores 64-bit behavior as it was prior to commits 8965eb19386fdf5ccd0ef8b02593eb8560aa3416 and 4a367f3a9dbf2e7ffcee4702203479809236ee6e (not sure why there are two of them), where GFP_DMA was forced on for 32-bit, but not for 64-bit, with the slight adjustment that afaict even 32-bit doesn't need this without CONFIG_ISA. Signed-off-by: Jan Beulich Acked-by: Takashi Iwai LKML-Reference: <4AF18187020000780001D8AA@vpn.id2.novell.com> Signed-off-by: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b2a71dca5642..a6e804d16c35 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,12 +45,10 @@ int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); -- cgit v1.2.1 From 46dc281b1bb02527195fe2ad50a3af6d7f7f7325 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 18:53:56 +0300 Subject: x86, apic: Use PAGE_SIZE instead of numbers The whole page is reserved for IO-APIC fixmap due to non-cacheable requirement. So lets note this explicitly instead of playing with numbers. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Maciej W. Rozycki LKML-Reference: <20091108155356.GB25940@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 31e9db3c12ad..9ee1c1628c17 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4111,7 +4111,7 @@ fake_ioapic_page: idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + PAGE_SIZE-1; ioapic_res++; } } -- cgit v1.2.1 From 4343fe1024e09e17667f95620ed3e69a7a5f4389 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 18:54:31 +0300 Subject: x86, ioapic: Use snrpintf while set names for IO-APIC resourses We should be ready that one day MAX_IO_APICS may raise its number. To prevent memory overwrite we're to use safe snprintf while set IO-APIC resourse name. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20091108155431.GC25940@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9ee1c1628c17..24d1458a1822 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4066,7 +4066,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) for (i = 0; i < nr_ioapics; i++) { res[i].name = mem; res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; } -- cgit v1.2.1 From f4a70c55376683213229af7266dc57ad81aee354 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 16:16:45 +0300 Subject: x86, apic: Get rid of apicid_to_cpu_present assign on 64-bit In fact it's never get used on x86-64 (for 64 bit platform we use differ technique to enumerate io-units). Reported-by: Stephen Rothwell Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra LKML-Reference: <20091108131645.GD5300@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_noop.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 9ab6ffb313ac..89629f622b60 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -162,7 +162,12 @@ struct apic apic_noop = { .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, + +#ifdef CONFIG_X86_32 .apicid_to_cpu_present = default_apicid_to_cpu_present, +#else + .apicid_to_cpu_present = NULL, +#endif .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, -- cgit v1.2.1 From 506f90eeae682dc96c11c7aefac0262b3a560b49 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Oct 2009 14:45:52 +0100 Subject: x86, amd-ucode: Check UCODE_MAGIC before loading the container file Signed-off-by: Borislav Petkov Signed-off-by: Andreas Herrmann LKML-Reference: <20091029134552.GC30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 366baa179913..f4c538b681ca 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -317,6 +317,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; } + if (*(u32 *)firmware->data != UCODE_MAGIC) { + printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + *(u32 *)firmware->data); + return UCODE_ERROR; + } + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); -- cgit v1.2.1 From 6e18da75c28b592594fd632cf3e6eb09d3d078de Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 29 Oct 2009 14:47:42 +0100 Subject: x86, amd-ucode: Remove needless log messages Signed-off-by: Andreas Herrmann Cc: Borislav Petkov LKML-Reference: <20091029134742.GD30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index f4c538b681ca..c043534fd986 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -109,12 +109,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 0; } - if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d: patch mismatch " - "(processor_rev_id: %x, equiv_cpu_id: %x)\n", - cpu, mc_header->processor_rev_id, equiv_cpu_id); + if (mc_header->processor_rev_id != equiv_cpu_id) return 0; - } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { @@ -185,9 +181,6 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - printk(KERN_DEBUG "microcode: size %u, total_size %u\n", - size, total_size); - if (total_size > size || total_size > UCODE_MAX_SIZE) { printk(KERN_ERR "microcode: error: size mismatch\n"); return NULL; -- cgit v1.2.1 From 7abc07531383ac7f727cc9d44e1360a829f2082e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Nov 2009 01:06:59 +0300 Subject: x86: apic: Do not use stacked physid_mask_t We should not use physid_mask_t as a stack based variable in apic code. This type depends on MAX_APICS parameter which may be huge enough. Especially it became a problem with apic NOOP driver which is portable between 32 bit and 64 bit environment (where we have really huge MAX_APICS). So apic driver should operate with pointers and a caller in turn should aware of allocation physid_mask_t variable. As a side (but positive) effect -- we may use already implemented physid_set_mask_of_physid function eliminating default_apicid_to_cpu_present completely. Note that physids_coerce and physids_promote turned into static inline from macro (since macro hides the fact that parameter is being interpreted as unsigned long, make it explicit). Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: Maciej W. Rozycki Cc: Stephen Rothwell LKML-Reference: <20091109220659.GA5568@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_noop.c | 18 ++++-------------- arch/x86/kernel/apic/bigsmp_32.c | 13 ++++--------- arch/x86/kernel/apic/es7000_32.c | 16 ++++++---------- arch/x86/kernel/apic/io_apic.c | 14 +++++++------- arch/x86/kernel/apic/numaq_32.c | 13 ++++++------- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 10 +++++----- arch/x86/kernel/visws_quirks.c | 2 +- 8 files changed, 34 insertions(+), 54 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 89629f622b60..d9acc3bee0f4 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void) return 0; } -static physid_mask_t noop_ioapic_phys_id_map(physid_mask_t phys_map) -{ - return phys_map; -} - static int noop_cpu_to_logical_apicid(int cpu) { return 0; @@ -100,9 +95,9 @@ static const struct cpumask *noop_target_cpus(void) return cpumask_of(0); } -static unsigned long noop_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static unsigned long noop_check_apicid_present(int bit) @@ -155,19 +150,14 @@ struct apic apic_noop = { .vector_allocation_domain = noop_vector_allocation_domain, .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = noop_ioapic_phys_id_map, + .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, .multi_timer_check = NULL, .apicid_to_node = noop_apicid_to_node, .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - -#ifdef CONFIG_X86_32 - .apicid_to_cpu_present = default_apicid_to_cpu_present, -#else - .apicid_to_cpu_present = NULL, -#endif + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 77a06413b6b2..38dcecfa5818 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void) #endif } -static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid) -{ - return physid_mask_of_physid(phys_apicid); -} - /* Mapping from cpu number to logical apicid */ static inline int bigsmp_cpu_to_logical_apicid(int cpu) { @@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu) return cpu_physical_id(cpu); } -static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xFFL); + physids_promote(0xFFL, retmap); } static int bigsmp_check_phys_apicid_present(int phys_apicid) @@ -230,7 +225,7 @@ struct apic apic_bigsmp = { .apicid_to_node = bigsmp_apicid_to_node, .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, - .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = bigsmp_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 89174f847b49..e85f8fb7f8e7 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void) return cpumask_of(smp_processor_id()); } -static unsigned long -es7000_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } + static unsigned long es7000_check_apicid_present(int bit) { return physid_isset(bit, phys_cpu_present_map); @@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu) static int cpu_id; -static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) +static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) { - physid_mask_t mask; - - mask = physid_mask_of_physid(cpu_id); + physid_set_mask_of_physid(cpu_id, retmap); ++cpu_id; - - return mask; } /* Mapping from cpu number to logical apicid */ @@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu) #endif } -static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) +static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0xff); + physids_promote(0xFFL, retmap); } static int es7000_check_phys_apicid_present(int cpu_physical_apicid) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 24d1458a1822..20ea8392bc57 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2031,7 +2031,7 @@ void __init setup_ioapic_ids_from_mpc(void) * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ - phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); /* * Set the IOAPIC ID to the value stored in the MPC table. @@ -2058,7 +2058,7 @@ void __init setup_ioapic_ids_from_mpc(void) * system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(phys_id_present_map, + if (apic->check_apicid_used(&phys_id_present_map, mp_ioapics[apic_id].apicid)) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", apic_id, mp_ioapics[apic_id].apicid); @@ -2073,7 +2073,7 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid = i; } else { physid_mask_t tmp; - tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); + apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", mp_ioapics[apic_id].apicid); @@ -3904,7 +3904,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) */ if (physids_empty(apic_id_map)) - apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); @@ -3920,10 +3920,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) * Every APIC in a system must have a unique ID or we get lots of nice * 'stuck on smp_invalidate_needed IPI wait' messages. */ - if (apic->check_apicid_used(apic_id_map, apic_id)) { + if (apic->check_apicid_used(&apic_id_map, apic_id)) { for (i = 0; i < get_physical_broadcast(); i++) { - if (!apic->check_apicid_used(apic_id_map, i)) + if (!apic->check_apicid_used(&apic_id_map, i)) break; } @@ -3936,7 +3936,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) apic_id = i; } - tmp = apic->apicid_to_cpu_present(apic_id); + apic->apicid_to_cpu_present(apic_id, &tmp); physids_or(apic_id_map, apic_id_map, tmp); if (reg_00.bits.ID != apic_id) { diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index efa00e2b8505..07cdbdcd7a92 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -334,10 +334,9 @@ static inline const struct cpumask *numaq_target_cpus(void) return cpu_all_mask; } -static inline unsigned long -numaq_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) { - return physid_isset(apicid, bitmap); + return physid_isset(apicid, *map); } static inline unsigned long numaq_check_apicid_present(int bit) @@ -371,10 +370,10 @@ static inline int numaq_multi_timer_check(int apic, int irq) return apic != 0 && irq == 0; } -static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) +static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* We don't have a good way to do this yet - hack */ - return physids_promote(0xFUL); + return physids_promote(0xFUL, retmap); } static inline int numaq_cpu_to_logical_apicid(int cpu) @@ -402,12 +401,12 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } -static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) +static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); int cpu = __ffs(logical_apicid & 0xf); - return physid_mask_of_physid(cpu + 4*node); + physid_set_mask_of_physid(cpu + 4*node, retmap); } /* Where the IO area was mapped on multiquad, always 0 otherwise */ diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 0c0182cc947d..1a6559f6768c 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -108,7 +108,7 @@ struct apic apic_default = { .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, - .apicid_to_cpu_present = default_apicid_to_cpu_present, + .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 645ecc4ff0be..9b419263d90d 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void) return cpumask_of(0); } -static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) +static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) { return 0; } @@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) +static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ - return physids_promote(0x0F); + physids_promote(0x0FL, retmap); } -static physid_mask_t summit_apicid_to_cpu_present(int apicid) +static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) { - return physid_mask_of_physid(0); + physid_set_mask_of_physid(0, retmap); } static int summit_check_phys_apicid_present(int physical_apicid) diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index f068553a1b17..cff70c86e18e 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - apic_cpus = apic->apicid_to_cpu_present(m->apicid); + apic->apicid_to_cpu_present(m->apicid, &apic_cpus); physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); /* * Validate version -- cgit v1.2.1 From a2202aa29289db64ca7988b12343158b67b27f10 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Tue, 10 Nov 2009 09:38:24 +0800 Subject: x86: Under BIOS control, restore AP's APIC_LVTTHMR to the BSP value On platforms where the BIOS handles the thermal monitor interrupt, APIC_LVTTHMR on each logical CPU is programmed to generate a SMI and OS must not touch it. Unfortunately AP bringup sequence using INIT-SIPI-SIPI clears all the LVT entries except the mask bit. Essentially this results in all LVT entries including the thermal monitoring interrupt set to masked (clearing the bios programmed value for APIC_LVTTHMR). And this leads to kernel take over the thermal monitoring interrupt on AP's but not on BSP (leaving the bios programmed value only on BSP). As a result of this, we have seen system hangs when the thermal monitoring interrupt is generated. Fix this by reading the initial value of thermal LVT entry on BSP and if bios has taken over the control, then program the same value on all AP's and leave the thermal monitoring interrupt control on all the logical cpu's to the bios. Signed-off-by: Yong Wang Reviewed-by: Suresh Siddha Cc: Borislav Petkov Cc: Arjan van de Ven LKML-Reference: <20091110013824.GA24940@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar Cc: stable@kernel.org --- arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 29 ++++++++++++++++++++++++++++- arch/x86/kernel/setup.c | 3 +++ 4 files changed, 34 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4df69a38be57..9053be5d95cd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,10 +837,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } -#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); -#endif select_idle_routine(c); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 80801705edd7..0d4102031a4c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1655,13 +1655,14 @@ static int __init mcheck_enable(char *str) } __setup("mce", mcheck_enable); -static int __init mcheck_init(void) +int __init mcheck_init(void) { atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); + mcheck_intel_therm_init(); + return 0; } -early_initcall(mcheck_init); /* * Sysfs support diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index b3a1dba75330..7f3cf36ed124 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; + #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) @@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +void mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && + cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); @@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * Always restore the value that BIOS has programmed on AP based on + * BSP's info we saved since BIOS is always setting the same value + * for all threads/cores + */ + apic_write(APIC_LVTTHMR, lvtthmr_init); + + h = lvtthmr_init; + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..179c1f2aa457 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -109,6 +109,7 @@ #ifdef CONFIG_X86_64 #include #endif +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -1024,6 +1025,8 @@ void __init setup_arch(char **cmdline_p) #endif #endif x86_init.oem.banner(); + + mcheck_init(); } #ifdef CONFIG_X86_32 -- cgit v1.2.1 From 41855b77547fa18d90ed6a5d322983d3fdab1959 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 9 Nov 2009 17:58:50 -0800 Subject: x86: GART: pci-gart_64.c: Use correct length in strncmp Signed-off-by: Joe Perches Cc: # .3x.x LKML-Reference: <1257818330.12852.72.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a9bcdf7c8801..eb46ab3f52b2 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -858,7 +858,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; -- cgit v1.2.1 From 9f6b3c2c30cfbb1166ce7e74a8f9fd93ae19d2de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 9 Nov 2009 21:03:43 +0100 Subject: hw-breakpoints: Fix broken a.out format dump Fix the broken a.out format dump. For now we only dump the ptrace breakpoints. TODO: Dump every perf breakpoints for the current thread, not only ptrace based ones. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: "K. Prasad" --- arch/x86/kernel/hw_breakpoint.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index e622620790bd..57dcee5fa958 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -375,6 +375,41 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return 0; } +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. + * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = ¤t->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} + /* * Release the user breakpoints used by ptrace */ -- cgit v1.2.1 From d1c84f79a6ba992dc01e312c44a21496303874d6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Nov 2009 12:07:23 +0100 Subject: x86: ucode-amd: Load ucode-patches once and not separately of each CPU This also implies that corresponding log messages, e.g. platform microcode: firmware: requesting amd-ucode/microcode_amd.bin show up only once on module load and not when ucode is updated for each CPU. Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091110110723.GH30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 24 +++++++++++++++++------- arch/x86/kernel/microcode_core.c | 6 ++++++ 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index c043534fd986..75538f647193 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -33,6 +33,8 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +const struct firmware *firmware; + struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -301,14 +303,10 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; enum ucode_state ret; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + if (firmware == NULL) return UCODE_NFOUND; - } if (*(u32 *)firmware->data != UCODE_MAGIC) { printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", @@ -318,8 +316,6 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) ret = generic_load_microcode(cpu, firmware->data, firmware->size); - release_firmware(firmware); - return ret; } @@ -339,7 +335,21 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +void init_microcode_amd(struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + if (request_firmware(&firmware, fw_name, device)) + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); +} + +void fini_microcode_amd(void) +{ + release_firmware(firmware); +} + static struct microcode_ops microcode_amd_ops = { + .init = init_microcode_amd, + .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 378e9a8f1bf8..d2a816021d9f 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -520,6 +520,9 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } + if (microcode_ops->init) + microcode_ops->init(µcode_pdev->dev); + get_online_cpus(); mutex_lock(µcode_mutex); @@ -563,6 +566,9 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); + if (microcode_ops->fini) + microcode_ops->fini(); + microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -- cgit v1.2.1 From 14c569425a0ae12cbeed72fdb8ebe78c48455dfd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Nov 2009 12:08:25 +0100 Subject: x86: ucode-amd: Don't warn when no ucode is available for a CPU revision There is no point in warning when there is no ucode available for a specific CPU revision. Currently the container-file, which provides the AMD ucode patches for OS load, contains only a few ucode patches. It's already clearly indicated by the printed patch_level whenever new ucode was available and an update happened. So the warning message is of no help but rather annoying on systems with many CPUs. Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091110110825.GI30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 75538f647193..9f13324054ce 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -105,11 +105,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) i++; } - if (!equiv_cpu_id) { - printk(KERN_WARNING "microcode: CPU%d: cpu revision " - "not listed in equivalent cpu table\n", cpu); + if (!equiv_cpu_id) return 0; - } if (mc_header->processor_rev_id != equiv_cpu_id) return 0; -- cgit v1.2.1 From 1a74357066369be91e6f4f431621a00b052df964 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 10 Nov 2009 12:09:20 +0100 Subject: x86: ucode-amd: Convert printk(KERN_*...) to pr_*(...) Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091110110920.GJ30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 9f13324054ce..26e33bd8485b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -78,12 +78,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); return -1; } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -113,7 +113,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - printk(KERN_ERR "microcode: CPU%d: loading of chipset " + pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " "specific code not yet supported\n", cpu); return 0; } @@ -143,14 +143,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - printk(KERN_ERR "microcode: CPU%d: update failed " + pr_err("microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); return -1; } - printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", - cpu, rev); - + pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -173,7 +171,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - printk(KERN_ERR "microcode: error: invalid type field in " + pr_err("microcode: error: invalid type field in " "container file section header\n"); return NULL; } @@ -181,7 +179,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error: size mismatch\n"); + pr_err("microcode: error: size mismatch\n"); return NULL; } @@ -210,15 +208,14 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - printk(KERN_ERR "microcode: error: invalid type field in " + pr_err("microcode: error: invalid type field in " "container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - printk(KERN_ERR "microcode: failed to allocate " - "equivalent CPU table\n"); + pr_err("microcode: failed to allocate equivalent CPU table\n"); return 0; } @@ -251,8 +248,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - printk(KERN_ERR "microcode: failed to create " - "equivalent cpu table\n"); + pr_err("microcode: failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -306,7 +302,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; if (*(u32 *)firmware->data != UCODE_MAGIC) { - printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } @@ -319,8 +315,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_INFO "microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("microcode: AMD microcode update via " + "/dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -336,7 +332,7 @@ void init_microcode_amd(struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; if (request_firmware(&firmware, fw_name, device)) - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + pr_err("microcode: failed to load file %s\n", fw_name); } void fini_microcode_amd(void) -- cgit v1.2.1 From d07c1be0693e0902d743160b8b638585b808f8ac Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:12 +0900 Subject: x86: Add iommu_init to x86_init_ops We call the detections functions of all the IOMMUs then all their initialization functions. The latter is pointless since we don't detect multiple different IOMMUs. What we need to do is calling the initialization function of the detected IOMMU. This adds iommu_init hook to x86_init_ops so if an IOMMU detection function can set its initialization function to the hook. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 ++ arch/x86/kernel/x86_init.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 839d49a669bc..a13478da533c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -292,6 +292,8 @@ static int __init pci_iommu_init(void) dma_debug_add_bus(&pci_bus_type); #endif + x86_init.iommu.iommu_init(); + calgary_iommu_init(); intel_iommu_init(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index bc9b230ef402..c46984d122dc 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -19,6 +19,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } +int __init iommu_init_noop(void) { return 0; } /* * The platform setup functions are preset with the default functions @@ -63,6 +64,10 @@ struct x86_init_ops x86_init __initdata = { .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { -- cgit v1.2.1 From d7b9f7be216b04ff9d108f856bc03d96e7b3439c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:13 +0900 Subject: x86: Calgary: Convert detect_calgary() to use iommu_init hook This changes detect_calgary() to set init_calgary() to iommu_init hook if detect_calgary() finds the Calgary IOMMU. We can kill the code to check if we found the IOMMU in init_calgary() since detect_calgary() sets init_calgary() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com LKML-Reference: <1257849980-22640-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 11 +++++------ arch/x86/kernel/pci-dma.c | 2 -- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 971a3bec47a8..47bd419ea4d2 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -46,6 +46,7 @@ #include #include #include +#include #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; @@ -1344,6 +1345,8 @@ static void __init get_tce_space_from_tar(void) return; } +int __init calgary_iommu_init(void); + void __init detect_calgary(void) { int bus; @@ -1445,6 +1448,8 @@ void __init detect_calgary(void) /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) swiotlb = 1; + + x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1461,12 +1466,6 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || (swiotlb && !calgary_detected)) - return -ENODEV; - - if (!calgary_detected) - return -ENODEV; - /* ok, we're trying to use Calgary - let's roll */ printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a13478da533c..0224da88256a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -294,8 +294,6 @@ static int __init pci_iommu_init(void) x86_init.iommu.iommu_init(); - calgary_iommu_init(); - intel_iommu_init(); amd_iommu_init(); -- cgit v1.2.1 From de957628ce7c84764ff41331111036b3ae5bad0f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:14 +0900 Subject: x86: GART: Convert gart_iommu_hole_init() to use iommu_init hook This changes gart_iommu_hole_init() to set gart_iommu_init() to iommu_init hook if gart_iommu_hole_init() finds the GART IOMMU. We can kill the code to check if we found the IOMMU in gart_iommu_init() since gart_iommu_hole_init() sets gart_iommu_init() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-4-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 2 ++ arch/x86/kernel/pci-dma.c | 2 -- arch/x86/kernel/pci-gart_64.c | 15 +++++---------- 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 128111d8ffe0..03933cf0b63c 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -28,6 +28,7 @@ #include #include #include +#include int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void) iommu_detected = 1; gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0224da88256a..ecde8543537f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -298,8 +298,6 @@ static int __init pci_iommu_init(void) amd_iommu_init(); - gart_iommu_init(); - no_iommu_init(); return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index eb46ab3f52b2..0410bd30060d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -709,7 +709,7 @@ static void gart_iommu_shutdown(void) } } -void __init gart_iommu_init(void) +int __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long iommu_start; @@ -719,7 +719,7 @@ void __init gart_iommu_init(void) long i; if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) - return; + return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; @@ -731,13 +731,6 @@ void __init gart_iommu_init(void) (agp_copy_info(agp_bridge, &info) < 0); #endif - if (swiotlb) - return; - - /* Did we detect a different HW IOMMU? */ - if (iommu_detected && !gart_iommu_aperture) - return; - if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || @@ -747,7 +740,7 @@ void __init gart_iommu_init(void) "but GART IOMMU not available.\n"); printk(KERN_WARNING "falling back to iommu=soft.\n"); } - return; + return 0; } /* need to map that range */ @@ -840,6 +833,8 @@ void __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; + + return 0; } void __init gart_parse_options(char *p) -- cgit v1.2.1 From ea1b0d3945c7374849235b6ecaea1191ee1d9d50 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:15 +0900 Subject: x86: amd_iommu: Convert amd_iommu_detect() to use iommu_init hook This changes amd_iommu_detect() to set amd_iommu_init to iommu_init hook if amd_iommu_detect() finds the AMD IOMMU. We can kill the code to check if we found the IOMMU in amd_iommu_init() since amd_iommu_detect() sets amd_iommu_init() only when it found the IOMMU. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-5-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 17 +++-------------- arch/x86/kernel/pci-dma.c | 2 -- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6acd43e9afd7..c41aabddaa2a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * definitions for the ACPI scanning code @@ -1176,19 +1177,10 @@ static struct sys_device device_amd_iommu = { * functions. Finally it prints some information about AMD IOMMUs and * the driver state and enables the hardware. */ -int __init amd_iommu_init(void) +static int __init amd_iommu_init(void) { int i, ret = 0; - - if (no_iommu) { - printk(KERN_INFO "AMD-Vi disabled by kernel command line\n"); - return 0; - } - - if (!amd_iommu_detected) - return -ENODEV; - /* * First parse ACPI tables to find the largest Bus/Dev/Func * we need to handle. Upon this information the shared data @@ -1344,10 +1336,7 @@ void __init amd_iommu_detect(void) if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif + x86_init.iommu.iommu_init = amd_iommu_init; } } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index ecde8543537f..5ca44a9301a0 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -296,8 +296,6 @@ static int __init pci_iommu_init(void) intel_iommu_init(); - amd_iommu_init(); - no_iommu_init(); return 0; } -- cgit v1.2.1 From 9d5ce73a64be2be8112147a3e0b551ad9cd1247b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:16 +0900 Subject: x86: intel-iommu: Convert detect_intel_iommu to use iommu_init hook This changes detect_intel_iommu() to set intel_iommu_init() to iommu_init hook if detect_intel_iommu() finds the IOMMU. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-6-git-send-email-fujita.tomonori@lab.ntt.co.jp> [ -v2: build fix for the !CONFIG_DMAR case ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 5ca44a9301a0..bed05e2e5890 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -294,8 +294,6 @@ static int __init pci_iommu_init(void) x86_init.iommu.iommu_init(); - intel_iommu_init(); - no_iommu_init(); return 0; } -- cgit v1.2.1 From ad32e8cb86e7894aac51c8963eaa9f36bb8a4e14 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:19 +0900 Subject: swiotlb: Defer swiotlb init printing, export swiotlb_print_info() This enables us to avoid printing swiotlb memory info when we initialize swiotlb. After swiotlb initialization, we could find that we don't need swiotlb. This patch removes the code to print swiotlb memory info in swiotlb_init() and exports the function to do that. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com Cc: tony.luck@intel.com Cc: benh@kernel.crashing.org LKML-Reference: <1257849980-22640-9-git-send-email-fujita.tomonori@lab.ntt.co.jp> [ -v2: merge up conflict ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index aaa6b7839f1e..ea20ef7ca523 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -52,8 +52,7 @@ void __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; if (swiotlb) { - printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_init(); + swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } } -- cgit v1.2.1 From 75f1cdf1dda92cae037ec848ae63690d91913eac Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 19:46:20 +0900 Subject: x86: Handle HW IOMMU initialization failure gracefully If HW IOMMU initialization fails (Intel VT-d often does this, typically due to BIOS bugs), we fall back to nommu. It doesn't work for the majority since nowadays we have more than 4GB memory so we must use swiotlb instead of nommu. The problem is that it's too late to initialize swiotlb when HW IOMMU initialization fails. We need to allocate swiotlb memory earlier from bootmem allocator. Chris explained the issue in detail: http://marc.info/?l=linux-kernel&m=125657444317079&w=2 The current x86 IOMMU initialization sequence is too complicated and handling the above issue makes it more hacky. This patch changes x86 IOMMU initialization sequence to handle the above issue cleanly. The new x86 IOMMU initialization sequence are: 1. we initialize the swiotlb (and setting swiotlb to 1) in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). dma_ops is set to swiotlb_dma_ops or nommu_dma_ops. if swiotlb usage is forced by the boot option, we finish here. 2. we call the detection functions of all the IOMMUs 3. the detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly). 4. if the IOMMU initialization function doesn't need to swiotlb then sets swiotlb to zero (e.g. the initialization is sucessful). 5. if we find that swiotlb is set to zero, we free swiotlb resource. Signed-off-by: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-10-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 10 +--------- arch/x86/kernel/pci-dma.c | 21 +++++++++++++-------- arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 9 --------- arch/x86/kernel/pci-swiotlb.c | 7 +++---- 8 files changed, 21 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0285521e0a99..66237fde758f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2110,8 +2110,8 @@ int __init amd_iommu_init_dma_ops(void) prealloc_protection_domains(); iommu_detected = 1; - force_iommu = 1; bad_dma_address = 0; + swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c41aabddaa2a..0d4581e602a4 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1330,7 +1330,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 03933cf0b63c..e0dfb6856aa2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -458,7 +458,7 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (swiotlb && !valid_agp) { + } else if (!valid_agp) { /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 47bd419ea4d2..833f491440b9 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1360,7 +1360,7 @@ void __init detect_calgary(void) * if the user specified iommu=off or iommu=soft or we found * another HW IOMMU already, bail out. */ - if (swiotlb || no_iommu || iommu_detected) + if (no_iommu || iommu_detected) return; if (!use_calgary) @@ -1445,10 +1445,6 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", specified_table_size); - /* swiotlb for devices that aren't behind the Calgary. */ - if (max_pfn > MAX_DMA32_PFN) - swiotlb = 1; - x86_init.iommu.iommu_init = calgary_iommu_init; } return; @@ -1476,11 +1472,7 @@ int __init calgary_iommu_init(void) return ret; } - force_iommu = 1; bad_dma_address = 0x0; - /* dma_ops is set to swiotlb or nommu */ - if (!dma_ops) - dma_ops = &nommu_dma_ops; return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index bed05e2e5890..a234e63c2656 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -124,24 +124,24 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + /* swiotlb is forced by the boot option */ + int use_swiotlb = swiotlb; #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif + pci_swiotlb_init(); + if (use_swiotlb) + return; - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ gart_iommu_hole_init(); detect_calgary(); detect_intel_iommu(); + /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); - - pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, @@ -291,10 +291,15 @@ static int __init pci_iommu_init(void) #ifdef CONFIG_PCI dma_debug_add_bus(&pci_bus_type); #endif - x86_init.iommu.iommu_init(); - no_iommu_init(); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else + swiotlb_free(); + return 0; } /* Must execute after PCI subsystem */ diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0410bd30060d..919182e15d1e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -833,6 +833,7 @@ int __init gart_iommu_init(void) flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; return 0; } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index a3933d4330cd..875e3822ae61 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = { .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, }; - -void __init no_iommu_init(void) -{ - if (dma_ops) - return; - - force_iommu = 0; /* no HW IOMMU */ - dma_ops = &nommu_dma_ops; -} diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index ea20ef7ca523..17ce4221bd03 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -46,13 +46,12 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif - if (swiotlb_force) - swiotlb = 1; if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; - } + } else + dma_ops = &nommu_dma_ops; } -- cgit v1.2.1 From 72d03802b8b5c841ab1da82bff0652628cbadf60 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 10 Nov 2009 21:35:17 +0900 Subject: x86, 32-bit: Fix swiotlb boot crash Ingo Molnar reported this boot crash: [ 8.655620] pata_amd 0000:00:06.0: version 0.4.1 [ 8.660286] BUG: unable to handle kernel NULL pointer dereference at 00000034 [ 8.663572] IP: [] dma_supported+0x3b/0xa4 [ 8.663572] *pde = 00000000 Initialize dma_ops properly in the 32-bit case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a234e63c2656..63eebee80e75 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -129,6 +129,8 @@ void __init pci_iommu_alloc(void) #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); +#else + dma_ops = &nommu_dma_ops; #endif pci_swiotlb_init(); if (use_swiotlb) -- cgit v1.2.1 From b4941a9a606f0131559cc040b64e8437ac7b32c5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 10 Nov 2009 14:37:58 +0100 Subject: x86: Add iommu_init to x86_init_ops, fix build Most of the time x86_init.h is included in pci-dma.c - but not always, leading to this rare build failure: arch/x86/kernel/pci-dma.c:296: error: 'x86_init' undeclared (first use in this function) So include asm/x86_init.h explicitly. Cc: FUJITA Tomonori Cc: chrisw@sous-sol.org Cc: dwmw2@infradead.org Cc: joerg.roedel@amd.com Cc: muli@il.ibm.com LKML-Reference: <1257849980-22640-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 63eebee80e75..f79870e89266 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,6 +11,7 @@ #include #include #include +#include static int forbid_dac __read_mostly; -- cgit v1.2.1 From 85160b92fbd35321104819283c91bfed2b553e3c Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 10 Nov 2009 13:49:24 -0500 Subject: x86: Add new Intel CPU cache size descriptors The latest rev of Intel doc AP-485 details new cache descriptors that we don't yet support. 12MB, 18MB and 24MB 24-way assoc L3 caches. Signed-off-by: Dave Jones LKML-Reference: <20091110184924.GA20337@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..14103924b627 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; -- cgit v1.2.1 From e02e0e1a130b9ca37c5186d38ad4b3aaf58bb149 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 10 Nov 2009 15:01:20 -0500 Subject: x86: Fix typo in Intel CPU cache size descriptor I double-checked the datasheet. One of the existing descriptors has a typo: it should be 2MB not 2038 KB. Signed-off-by: Dave Jones Cc: # .3x.x: 85160b9: x86: Add new Intel CPU cache size descriptors Cc: # .3x.x LKML-Reference: <20091110200120.GA27090@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 14103924b627..8178d0352935 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ -- cgit v1.2.1 From ce6b5d768c79b9d5dd6345c033bae781d5ca9b8e Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Wed, 11 Nov 2009 15:51:25 +0800 Subject: x86: Mark the thermal init functions __init Mark the thermal init functions __init so that the init memory can be freed. Signed-off-by: Yong Wang LKML-Reference: <20091111075125.GA17900@ywang-moblin2.bj.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 7f3cf36ed124..8a73d5c12a05 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -256,7 +256,7 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } -void mcheck_intel_therm_init(void) +void __init mcheck_intel_therm_init(void) { /* * This function is only called on boot CPU. Save the init thermal @@ -268,7 +268,7 @@ void mcheck_intel_therm_init(void) lvtthmr_init = apic_read(APIC_LVTTHMR); } -void intel_init_thermal(struct cpuinfo_x86 *c) +void __init intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); int tm2 = 0; -- cgit v1.2.1 From b18485e7acfe1a634615d1c628ef644c0d58d472 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 12 Nov 2009 00:03:28 +0900 Subject: swiotlb: Remove the swiotlb variable usage POWERPC doesn't expect it to be used. This fixes the linux-next build failure reported by Stephen Rothwell: lib/swiotlb.c: In function 'setup_io_tlb_npages': lib/swiotlb.c:114: error: 'swiotlb' undeclared (first use in this function) Reported-by: Stephen Rothwell Signed-off-by: FUJITA Tomonori Cc: peterz@infradead.org LKML-Reference: <20091112000258F.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 5 +---- arch/x86/kernel/pci-swiotlb.c | 13 ++++++++++++- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f79870e89266..0b11bf18f540 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -125,16 +125,13 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { - /* swiotlb is forced by the boot option */ - int use_swiotlb = swiotlb; #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #else dma_ops = &nommu_dma_ops; #endif - pci_swiotlb_init(); - if (use_swiotlb) + if (pci_swiotlb_init()) return; gart_iommu_hole_init(); diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 17ce4221bd03..a6e5d0ffa3a7 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -42,16 +42,27 @@ static struct dma_map_ops swiotlb_dma_ops = { .dma_supported = NULL, }; -void __init pci_swiotlb_init(void) +/* + * pci_swiotlb_init - initialize swiotlb if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 if (!no_iommu && max_pfn > MAX_DMA32_PFN) swiotlb = 1; #endif + if (swiotlb_force) + swiotlb = 1; + if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } else dma_ops = &nommu_dma_ops; + + return swiotlb_force; } -- cgit v1.2.1 From 9f15226e75583547aaf542c6be4bdac1060dd425 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 11 Nov 2009 20:03:29 +0100 Subject: x86, ucode-amd: Ensure ucode update on suspend/resume after CPU off/online cycle When switching a CPU offline/online and then doing suspend/resume, ucode is not updated on this CPU. This is due to the microcode_fini_cpu() call which frees uci->mc when setting the CPU offline: static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; vfree(uci->mc); uci->mc = NULL; } When the CPU is set online uci->mc is still NULL because no ucode update is required. Finally this prevents ucode update when resuming after suspend: static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!uci->mc) return UCODE_NFOUND; ... } Fix is to check whether uci->mc is valid before microcode_resume_cpu() is called. Signed-off-by: Andreas Herrmann Cc: dimm LKML-Reference: <20091111190329.GF18592@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index d2a816021d9f..adf234061540 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -393,7 +393,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid) + if (uci->valid && uci->mc) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); -- cgit v1.2.1 From cffd377e5879ea58522224a785a083f201afd80e Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 12 Nov 2009 15:52:40 +0900 Subject: x86, mce: Fix __init annotations The intel_init_thermal() is called from resume path, so it cannot be marked as __init. OTOH mce_banks_init() is only called from __mcheck_cpu_cap_init() which is marked as __cpuinit, so it can be also marked as __cpuinit. Signed-off-by: Hidetoshi Seto Acked-by: Yong Wang LKML-Reference: <4AFBB0B8.2070501@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0d4102031a4c..5f277cad2ed7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1201,7 +1201,7 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int mce_banks_init(void) +static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; @@ -1242,7 +1242,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void) WARN_ON(banks != 0 && b != banks); banks = b; if (!mce_banks) { - int err = mce_banks_init(); + int err = __mcheck_cpu_mce_banks_init(); if (err) return err; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 8a73d5c12a05..4fef985fc221 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -268,7 +268,7 @@ void __init mcheck_intel_therm_init(void) lvtthmr_init = apic_read(APIC_LVTTHMR); } -void __init intel_init_thermal(struct cpuinfo_x86 *c) +void intel_init_thermal(struct cpuinfo_x86 *c) { unsigned int cpu = smp_processor_id(); int tm2 = 0; -- cgit v1.2.1 From db48cccc7c709ccfa7cb4ac702bc27c216bffee7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 12 Nov 2009 11:25:34 +0900 Subject: perf_event, x86: Annotate init functions and data Annotate init functions and data with __init and __initconst. Signed-off-by: Hiroshi Shimamoto Cc: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: <4AFB721E.8070203@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2e20bca3cca1..bd8743024204 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -245,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; -static const u64 nehalem_hw_cache_event_ids +static __initconst u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -336,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids }, }; -static const u64 core2_hw_cache_event_ids +static __initconst u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -427,7 +427,7 @@ static const u64 core2_hw_cache_event_ids }, }; -static const u64 atom_hw_cache_event_ids +static __initconst u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -536,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event) return hw_event & CORE_EVNTSEL_MASK; } -static const u64 amd_hw_cache_event_ids +static __initconst u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -1964,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = { .priority = 1 }; -static struct x86_pmu p6_pmu = { +static __initconst struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = p6_pmu_handle_irq, .disable_all = p6_pmu_disable_all, @@ -1992,7 +1992,7 @@ static struct x86_pmu p6_pmu = { .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu intel_pmu = { +static __initconst struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, @@ -2016,7 +2016,7 @@ static struct x86_pmu intel_pmu = { .get_event_idx = intel_get_event_idx, }; -static struct x86_pmu amd_pmu = { +static __initconst struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, @@ -2037,7 +2037,7 @@ static struct x86_pmu amd_pmu = { .get_event_idx = gen_get_event_idx, }; -static int p6_pmu_init(void) +static __init int p6_pmu_init(void) { switch (boot_cpu_data.x86_model) { case 1: @@ -2071,7 +2071,7 @@ static int p6_pmu_init(void) return 0; } -static int intel_pmu_init(void) +static __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -2144,7 +2144,7 @@ static int intel_pmu_init(void) return 0; } -static int amd_pmu_init(void) +static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) -- cgit v1.2.1 From 24a065624dcdd91e8bfd0f14113feb91c7ed11ca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Apr 2009 05:33:18 -0700 Subject: sysctl x86: Remove dead binary sysctl support Now that sys_sysctl is a generic wrapper around /proc/sys .ctl_name and .strategy members of sysctl tables are dead code. Remove them. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: Eric W. Biederman --- arch/x86/kernel/vsyscall_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8cb4974ff599..e02d92d12bcd 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -237,7 +237,7 @@ static ctl_table kernel_table2[] = { }; static ctl_table kernel_root_table2[] = { - { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, + { .procname = "kernel", .mode = 0555, .child = kernel_table2 }, {} }; -- cgit v1.2.1 From 15cd8812ab2ce62a2f779e93a8398bdad752291a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 12 Nov 2009 18:15:43 -0500 Subject: x86: Remove the CPU cache size printk's They aren't really useful, and they pollute the dmesg output a lot (especially on machines with many cores). Also the same information can be trivially found out from userspace. Reported-by: Mike Travis Signed-off-by: Dave Jones Acked-by: H. Peter Anvin Cc: Andi Kleen Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091112231542.GA7129@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 804c40e2bc3e..0df4c2b7107f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -488,22 +488,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #endif } - if (trace) - printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if (l1i) - printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); - - if (l1d) - printk(KERN_CONT ", L1 D cache: %dK\n", l1d); - else - printk(KERN_CONT "\n"); - - if (l2) - printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); - - if (l3) - printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; -- cgit v1.2.1 From 0388423dba2217b4e5b6c61690b0506d13b25a49 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 13 Nov 2009 15:30:00 -0500 Subject: x86: Minimise printk spew from per-vendor init code In the default case where the kernel supports all CPU vendors, we currently print out a bunch of not useful messages on every system. 32-bit: KERNEL supported cpus: Intel GenuineIntel AMD AuthenticAMD NSC Geode by NSC Cyrix CyrixInstead Centaur CentaurHauls Transmeta GenuineTMx86 Transmeta TransmetaCPU UMC UMC UMC UMC 64-bit: KERNEL supported cpus: Intel GenuineIntel AMD AuthenticAMD Centaur CentaurHauls Given that "what CPUs does the kernel support" isn't useful for the "support everything" case, we can suppress these printk's. Signed-off-by: Dave Jones LKML-Reference: <20091113203000.GA19160@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc25c2b4a567..617a29f95b3c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -656,6 +656,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) void __init early_cpu_init(void) { +#ifdef PROCESSOR_SELECT const struct cpu_dev *const *cdev; int count = 0; @@ -676,7 +677,7 @@ void __init early_cpu_init(void) cpudev->c_ident[j]); } } - +#endif early_identify_cpu(&boot_cpu_data); } -- cgit v1.2.1 From b01c845f0f2e3f9e54e6a78d5d56895f5b95e818 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 13 Nov 2009 14:38:26 -0800 Subject: x86: Remove CPU cache size output for non-Intel too As Dave Jones said about the output in intel_cacheinfo.c: "They aren't useful, and pollute the dmesg output a lot (especially on machines with many cores). Also the same information can be trivially found out from userspace." Give the generic display_cacheinfo() function the same treatment. Signed-off-by: Roland Dreier Acked-by: Dave Jones Cc: Mike Travis Cc: Andi Kleen Cc: Heiko Carstens Cc: Randy Dunlap Cc: Tejun Heo Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 617a29f95b3c..9db1e2425c27 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); #ifdef CONFIG_X86_64 /* On K8 L1 TLB is inclusive, so don't count it */ @@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #endif c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) -- cgit v1.2.1 From 31c997cac76e62918858a432fff6e43fd48425f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Nov 2009 10:34:41 +0100 Subject: x86: Fix cpu_devs[] initialization in early_cpu_init() Yinghai Lu noticed that this commit: 0388423: x86: Minimise printk spew from per-vendor init code mistakenly left out the initialization of cpu_devs[] in the !PROCESSOR_SELECT case. Fix it. Reported-by: Yinghai Lu Cc: Dave Jones LKML-Reference: <20091113203000.GA19160@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9db1e2425c27..61242a56c2d6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -651,28 +651,34 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) void __init early_cpu_init(void) { -#ifdef PROCESSOR_SELECT const struct cpu_dev *const *cdev; int count = 0; +#ifdef PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; - unsigned int j; if (count >= X86_VENDOR_NUM) break; cpu_devs[count] = cpudev; count++; - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); +#ifdef PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } } - } #endif + } early_identify_cpu(&boot_cpu_data); } -- cgit v1.2.1 From 68efa37df779b3e04280598e8b5b3a1919b65fee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 14 Nov 2009 01:35:29 +0100 Subject: hw-breakpoints, x86: Fix modular KVM build This build error: arch/x86/kvm/x86.c:3655: error: implicit declaration of function 'hw_breakpoint_restore' Happens because in the CONFIG_KVM=m case there's no 'CONFIG_KVM' define in the kernel - it's CONFIG_KVM_MODULE in that case. Make the prototype available unconditionally. Cc: Frederic Weisbecker Cc: Prasad LKML-Reference: <1258114575-32655-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 57dcee5fa958..752daebe91c6 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -43,6 +43,7 @@ /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, dr7); +EXPORT_PER_CPU_SYMBOL(dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); @@ -409,6 +410,7 @@ void aout_dump_debugregs(struct user *dump) dump->u_debugreg[7] = dr7; } +EXPORT_SYMBOL_GPL(aout_dump_debugregs); /* * Release the user breakpoints used by ptrace @@ -424,7 +426,6 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) } } -#ifdef CONFIG_KVM void hw_breakpoint_restore(void) { set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); @@ -435,7 +436,6 @@ void hw_breakpoint_restore(void) set_debugreg(__get_cpu_var(dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); -#endif /* * Handle debug exception notifications. -- cgit v1.2.1 From a3b28ee1090072092e2be043c24df94230e725b2 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 20:46:36 +0900 Subject: x86: Set dma_ops to nommu_dma_ops by default We set dma_ops to nommu_dma_ops at two different places for x86_32 and x86_64. This unifies them by setting dma_ops to nommu_dma_ops by default. Signed-off-by: FUJITA Tomonori LKML-Reference: <1258199198-16657-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 4 +--- arch/x86/kernel/pci-swiotlb.c | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0b11bf18f540..f170b5364b41 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -15,7 +15,7 @@ static int forbid_dac __read_mostly; -struct dma_map_ops *dma_ops; +struct dma_map_ops *dma_ops = &nommu_dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -128,8 +128,6 @@ void __init pci_iommu_alloc(void) #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); -#else - dma_ops = &nommu_dma_ops; #endif if (pci_swiotlb_init()) return; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index a6e5d0ffa3a7..e36e71daa44c 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -61,8 +61,7 @@ int __init pci_swiotlb_init(void) if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; - } else - dma_ops = &nommu_dma_ops; + } return swiotlb_force; } -- cgit v1.2.1 From 94a15564ac63af6bb2ff8d4d04f86d5e7ee0278a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 20:46:37 +0900 Subject: x86: Move iommu_shutdown_noop to x86_init.c iommu_init_noop() is in arch/x86/kernel/x86_init.c but iommu_shutdown_noop() in arch/x86/include/asm/iommu.h. This moves iommu_shutdown_noop() to x86_init.c for consistency. Signed-off-by: FUJITA Tomonori LKML-Reference: <1258199198-16657-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/x86_init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c46984d122dc..80f3ae24b974 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -20,6 +20,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } +void __init iommu_shutdown_noop(void) { } /* * The platform setup functions are preset with the default functions -- cgit v1.2.1 From f4131c6259b46bd84dcfcd3bb9ed08e99e2875a4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 14 Nov 2009 21:26:50 +0900 Subject: x86: Make calgary_iommu_init() static This makes calgary_iommu_init() static and moves it to remove the forward declaration. Signed-off-by: FUJITA Tomonori Cc: muli@il.ibm.com LKML-Reference: <20091114212603U.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 833f491440b9..c84ad037f586 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1345,7 +1345,24 @@ static void __init get_tce_space_from_tar(void) return; } -int __init calgary_iommu_init(void); +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + bad_dma_address = 0x0; + + return 0; +} void __init detect_calgary(void) { @@ -1458,25 +1475,6 @@ cleanup: } } -int __init calgary_iommu_init(void) -{ - int ret; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - bad_dma_address = 0x0; - - return 0; -} - static int __init calgary_parse_options(char *p) { unsigned int bridge; -- cgit v1.2.1 From 14722485830fe6baba738b91d96f06fbd6cf7a18 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 13 Nov 2009 11:56:24 +0000 Subject: x86-64: __copy_from_user_inatomic() adjustments This v2.6.26 commit: ad2fc2c: x86: fix copy_user on x86 rendered __copy_from_user_inatomic() identical to copy_user_generic(), yet didn't make the former just call the latter from an inline function. Furthermore, this v2.6.19 commit: b885808: [PATCH] Add proper sparse __user casts to __copy_to_user_inatomic converted the return type of __copy_to_user_inatomic() from unsigned long to int, but didn't do the same to __copy_from_user_inatomic(). Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Alexander Viro Cc: Arjan van de Ven Cc: Andi Kleen Cc: LKML-Reference: <4AFD5778020000780001F8F4@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/x8664_ksyms_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0cdd8cc1d67..cd54276b6be8 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -32,7 +32,6 @@ EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -- cgit v1.2.1 From dc186ad741c12ae9ecac8b89e317ef706fdaf8f6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 01:09:48 +0900 Subject: workqueue: Add debugobjects support Add debugobject support to track the life time of work_structs. While at it, remove duplicate definition of INIT_DELAYED_WORK_ON_STACK(). Signed-off-by: Thomas Gleixner Signed-off-by: Tejun Heo --- arch/x86/kernel/smpboot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 565ebc65920e..ba43dfed353d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -687,7 +687,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK(&c_idle.work, do_fork_idle); + INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -713,6 +713,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); + destroy_work_on_stack(&c_idle.work); return PTR_ERR(c_idle.idle); } @@ -831,6 +832,7 @@ do_rest: smpboot_restore_warm_reset_vector(); } + destroy_work_on_stack(&c_idle.work); return boot_error; } -- cgit v1.2.1 From 62ad33f67003b9a7b6013f0511579b9805e11626 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 16 Nov 2009 11:44:30 +0900 Subject: x86: Don't put iommu_shutdown_noop() in init section It causes kernel panic on shutdown or reboot. Signed-off-by: Hiroshi Shimamoto Acked-by: FUJITA Tomonori LKML-Reference: <4B00BC8E.50801@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/x86_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 80f3ae24b974..d11c5ff7c65e 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -20,7 +20,7 @@ void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } -void __init iommu_shutdown_noop(void) { } +void iommu_shutdown_noop(void) { } /* * The platform setup functions are preset with the default functions -- cgit v1.2.1 From 411462f62a65eeae7f451c6eb7a38b9d8759c61a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2009 11:52:39 +0100 Subject: x86: Fix printk format due to variable type change clockevents.mult became u32. Fix the printk format. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 894aa97f0717..cf4ee5195c5e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -662,7 +662,7 @@ static int __init calibrate_APIC_clock(void) calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", calibration_result); -- cgit v1.2.1 From 303fc0870f8fbfabe260c5c32b18e53458d597ea Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 12 Nov 2009 13:09:31 -0500 Subject: x86: AMD Northbridge: Verify NB's node is online Fix panic seen on some IBM and HP systems on 2.6.32-rc6: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] find_next_bit+0x77/0x9c [...] [] cpumask_next_and+0x2e/0x3b [] pci_device_probe+0x8e/0xf5 [] ? driver_sysfs_add+0x47/0x6c [] driver_probe_device+0xd9/0x1f9 [] __driver_attach+0x58/0x7c [] ? __driver_attach+0x0/0x7c [] bus_for_each_dev+0x54/0x89 [] driver_attach+0x19/0x1b [] bus_add_driver+0xd3/0x23d [] driver_register+0x98/0x109 [] __pci_register_driver+0x63/0xd3 [] ? up_read+0x26/0x2a [] ? k8temp_init+0x0/0x20 [k8temp] [] k8temp_init+0x1e/0x20 [k8temp] [] do_one_initcall+0x6d/0x185 [] sys_init_module+0xd3/0x236 [] system_call_fastpath+0x16/0x1b I put in a printk and commented out the set_dev_node() call when and got this output: quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x0 quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x1 quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x2 quirk_amd_nb_node: current numa_node = 0x0, would set to val & 7 = 0x3 I.e. the issue appears to be that the HW has set val to a valid value, however, the system is only configured for a single node -- 0, the others are offline. Check to see if the node is actually online before setting the numa node for an AMD northbridge in quirk_amd_nb_node(). Signed-off-by: Prarit Bhargava Cc: bhavna.sarathy@amd.com Cc: jbarnes@virtuousgeek.org Cc: andreas.herrmann3@amd.com LKML-Reference: <20091112180933.12532.98685.sendpatchset@prarit.bos.redhat.com> [ v2: clean up the code and add comments ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6c3b2c6fd772..18093d7498f0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) { struct pci_dev *nb_ht; unsigned int devfn; + u32 node; u32 val; devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); @@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev) return; pci_read_config_dword(nb_ht, 0x60, &val); - set_dev_node(&dev->dev, val & 7); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); pci_dev_put(nb_ht); } -- cgit v1.2.1 From 3c93ca00eeeb774c7dd666cc7286a9e90c53e998 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Nov 2009 15:42:18 +0100 Subject: x86: Add missing might_fault() checks to copy_{to,from}_user() On x86-64, copy_[to|from]_user() rely on assembly routines that never call might_fault(), making us missing various lockdep checks. This doesn't apply to __copy_from,to_user() that explicitly handle these calls, neither is it a problem in x86-32 where copy_to,from_user() rely on the "__" prefixed versions that also call might_fault(). Signed-off-by: Frederic Weisbecker Cc: Arjan van de Ven Cc: Linus Torvalds Cc: Nick Piggin Cc: Peter Zijlstra LKML-Reference: <1258382538-30979-1-git-send-email-fweisbec@gmail.com> [ v2: fix module export ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/x8664_ksyms_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd54276b6be8..a1029769b6f2 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); -EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(_copy_to_user); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -- cgit v1.2.1 From e79c65a97c01d5da4317f44f9f98b3814e091a43 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 16 Nov 2009 18:14:26 +0300 Subject: x86: io-apic: IO-APIC MMIO should not fail on resource insertion If IO-APIC base address is 1K aligned we should not fail on resourse insertion procedure. For this sake we define IO_APIC_SLOT_SIZE constant which should cover all IO-APIC direct accessible registers. An example of a such configuration is there http://marc.info/?l=linux-kernel&m=118114792006520 | | Quoting the message | | IOAPIC[0]: apic_id 2, version 32, address 0xfec00000, GSI 0-23 | IOAPIC[1]: apic_id 3, version 32, address 0xfec80000, GSI 24-47 | IOAPIC[2]: apic_id 4, version 32, address 0xfec80400, GSI 48-71 | IOAPIC[3]: apic_id 5, version 32, address 0xfec84000, GSI 72-95 | IOAPIC[4]: apic_id 8, version 32, address 0xfec84400, GSI 96-119 | Reported-by: "Maciej W. Rozycki" Signed-off-by: Cyrill Gorcunov Acked-by: Yinghai Lu LKML-Reference: <20091116151426.GC5653@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 20ea8392bc57..ff237199fa23 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4100,18 +4100,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + PAGE_SIZE-1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } -- cgit v1.2.1 From 42109197eb7c01080eea6d9cd48ca23cbc3c566c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sun, 15 Nov 2009 21:19:52 +0900 Subject: x86: gart: Add own dma_mapping_error function GART IOMMU is the only user of bad_dma_address variable. This patch converts GART to use the newer mechanism, fill in ->mapping_error() in struct dma_map_ops, to make dma_mapping_error() work in IOMMU specific way. Signed-off-by: FUJITA Tomonori Acked-by: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 919182e15d1e..61c4d1e41a6b 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -47,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +static dma_addr_t bad_dma_addr; + /* * If this is disabled the IOMMU will use an optimized flushing strategy * of only flushing when an mapping is reused. With it true the GART is @@ -217,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); - return bad_dma_address; + return bad_dma_addr; } for (i = 0; i < npages; i++) { @@ -303,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_address) { + if (addr == bad_dma_addr) { if (i > 0) gart_unmap_sg(dev, sg, i, dir, NULL); nents = 0; @@ -456,7 +458,7 @@ error: iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_address; + s->dma_address = bad_dma_addr; return 0; } @@ -480,7 +482,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, DMA_BIDIRECTIONAL, align_mask); flush_gart(); - if (paddr != bad_dma_address) { + if (paddr != bad_dma_addr) { *dma_addr = paddr; return page_address(page); } @@ -500,6 +502,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -687,6 +694,7 @@ static struct dma_map_ops gart_dma_ops = { .unmap_page = gart_unmap_page, .alloc_coherent = gart_alloc_coherent, .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, }; static void gart_iommu_shutdown(void) @@ -785,7 +793,7 @@ int __init gart_iommu_init(void) iommu_start = aper_size - iommu_size; iommu_bus_base = info.aper_base + iommu_start; - bad_dma_address = iommu_bus_base; + bad_dma_addr = iommu_bus_base; iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* -- cgit v1.2.1 From 8fd524b355daef0945692227e726fb444cebcd4f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sun, 15 Nov 2009 21:19:53 +0900 Subject: x86: Kill bad_dma_address variable This kills bad_dma_address variable, the old mechanism to enable IOMMU drivers to make dma_mapping_error() work in IOMMU's specific way. bad_dma_address variable was introduced to enable IOMMU drivers to make dma_mapping_error() work in IOMMU's specific way. However, it can't handle systems that use both swiotlb and HW IOMMU. SO we introduced dma_map_ops->mapping_error to solve that case. Intel VT-d, GART, and swiotlb already use dma_map_ops->mapping_error. Calgary, AMD IOMMU, and nommu use zero for an error dma address. This adds DMA_ERROR_CODE and converts them to use it (as SPARC and POWER does). Signed-off-by: FUJITA Tomonori Acked-by: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 21 ++++++++++----------- arch/x86/kernel/pci-calgary_64.c | 22 ++++++++++------------ arch/x86/kernel/pci-dma.c | 3 --- arch/x86/kernel/pci-nommu.c | 2 +- 4 files changed, 21 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 66237fde758f..093bd526c949 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -928,7 +928,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } if (unlikely(address == -1)) - address = bad_dma_address; + address = DMA_ERROR_CODE; WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); @@ -1544,7 +1544,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, pte = dma_ops_get_pte(dom, address); if (!pte) - return bad_dma_address; + return DMA_ERROR_CODE; __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; @@ -1625,7 +1625,7 @@ static dma_addr_t __map_single(struct device *dev, retry: address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, dma_mask); - if (unlikely(address == bad_dma_address)) { + if (unlikely(address == DMA_ERROR_CODE)) { /* * setting next_address here will let the address * allocator only scan the new allocated range in the @@ -1646,7 +1646,7 @@ retry: start = address; for (i = 0; i < pages; ++i) { ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); - if (ret == bad_dma_address) + if (ret == DMA_ERROR_CODE) goto out_unmap; paddr += PAGE_SIZE; @@ -1674,7 +1674,7 @@ out_unmap: dma_ops_free_addresses(dma_dom, address, pages); - return bad_dma_address; + return DMA_ERROR_CODE; } /* @@ -1690,7 +1690,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_addr_t i, start; unsigned int pages; - if ((dma_addr == bad_dma_address) || + if ((dma_addr == DMA_ERROR_CODE) || (dma_addr + size > dma_dom->aperture_size)) return; @@ -1732,7 +1732,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, INC_STATS_COUNTER(cnt_map_single); if (!check_device(dev)) - return bad_dma_address; + return DMA_ERROR_CODE; dma_mask = *dev->dma_mask; @@ -1743,12 +1743,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, return (dma_addr_t)paddr; if (!dma_ops_domain(domain)) - return bad_dma_address; + return DMA_ERROR_CODE; spin_lock_irqsave(&domain->lock, flags); addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, dma_mask); - if (addr == bad_dma_address) + if (addr == DMA_ERROR_CODE) goto out; iommu_completion_wait(iommu); @@ -1957,7 +1957,7 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { + if (*dma_addr == DMA_ERROR_CODE) { spin_unlock_irqrestore(&domain->lock, flags); goto out_free; } @@ -2110,7 +2110,6 @@ int __init amd_iommu_init_dma_ops(void) prealloc_protection_domains(); iommu_detected = 1; - bad_dma_address = 0; swiotlb = 0; #ifdef CONFIG_GART_IOMMU gart_iommu_aperture_disabled = 1; diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index c84ad037f586..af9f436096a2 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -245,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev, if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else - return bad_dma_address; + return DMA_ERROR_CODE; } } @@ -261,11 +261,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = bad_dma_address; + dma_addr_t ret = DMA_ERROR_CODE; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == bad_dma_address)) + if (unlikely(entry == DMA_ERROR_CODE)) goto error; /* set the return dma address */ @@ -280,7 +280,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, error: printk(KERN_WARNING "Calgary: failed to allocate %u pages in " "iommu %p\n", npages, tbl); - return bad_dma_address; + return DMA_ERROR_CODE; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, @@ -291,8 +291,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned long flags; /* were we called with bad_dma_address? */ - badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); - if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); return; @@ -374,7 +374,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); - if (entry == bad_dma_address) { + if (entry == DMA_ERROR_CODE) { /* makes sure unmap knows to stop */ s->dma_length = 0; goto error; @@ -392,7 +392,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, error: calgary_unmap_sg(dev, sg, nelems, dir, NULL); for_each_sg(sg, s, nelems, i) { - sg->dma_address = bad_dma_address; + sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; } return 0; @@ -447,7 +447,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, /* set up tces to cover the allocated range */ mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) + if (mapping == DMA_ERROR_CODE) goto free; *dma_handle = mapping; return ret; @@ -728,7 +728,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ - iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ /* for CalIOC2 - avoid the entire first MB */ @@ -1359,8 +1359,6 @@ static int __init calgary_iommu_init(void) return ret; } - bad_dma_address = 0x0; - return 0; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index bf621b9ee26e..afcc58b69c7c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -43,9 +43,6 @@ int iommu_detected __read_mostly = 0; */ int iommu_pass_through __read_mostly; -dma_addr_t bad_dma_address __read_mostly = 0; -EXPORT_SYMBOL(bad_dma_address); - /* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 875e3822ae61..22be12b60a8f 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) - return bad_dma_address; + return DMA_ERROR_CODE; flush_write_buffers(); return bus; } -- cgit v1.2.1 From 1f7564ca831a00b21bb493ef174c845b2ba9e64d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sun, 15 Nov 2009 21:19:54 +0900 Subject: x86: Calgary: Remove unnecessary DMA_ERROR_CODE usage This cleans up iommu_alloc() a bit and removes unnecessary DMA_ERROR_CODE usage. Signed-off-by: FUJITA Tomonori Acked-by: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-4-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index af9f436096a2..849a0995d970 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -261,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { unsigned long entry; - dma_addr_t ret = DMA_ERROR_CODE; + dma_addr_t ret; entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_ERROR_CODE)) - goto error; + if (unlikely(entry == DMA_ERROR_CODE)) { + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return DMA_ERROR_CODE; + } /* set the return dma address */ ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); @@ -274,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* put the TCEs in the HW table */ tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - return ret; - -error: - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); - return DMA_ERROR_CODE; } static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, -- cgit v1.2.1 From 123bf0e2eddcda36a33bdfc87aa1fb07229f07b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 15 Nov 2009 21:19:52 +0900 Subject: x86: gart: Clean up the code a bit Clean up various small stylistic details in the GART code. No functionality changed. Cc: FUJITA Tomonori Cc: Jesse Barnes Cc: muli@il.ibm.com Cc: joerg.roedel@amd.com LKML-Reference: <1258287594-8777-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 116 ++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 55 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 61c4d1e41a6b..e6a0d402f171 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -95,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); @@ -297,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, int i; #ifdef CONFIG_IOMMU_DEBUG - printk(KERN_DEBUG "dma_map_sg overflow\n"); + pr_debug("dma_map_sg overflow\n"); #endif for_each_sg(sg, s, nents, i) { @@ -392,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, if (!dev) dev = &x86_dma_fallback_dev; - out = 0; - start = 0; - start_sg = sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); @@ -420,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, sgmap, pages, need) < 0) goto error; out++; - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; } } @@ -523,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - printk(KERN_WARNING + pr_warning( "PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", iommu_size >> 20); @@ -578,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static int gart_resume(struct sys_device *dev) +static void gart_fixup_northbridges(struct sys_device *dev) { - printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + int i; - if (fix_up_north_bridges) { - int i; + if (!fix_up_north_bridges) + return; - printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < num_k8_northbridges; i++) { + struct pci_dev *dev = k8_northbridges[i]; - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, - aperture_alloc >> 25); - } + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } +} + +static int gart_resume(struct sys_device *dev) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(dev); enable_gart_translations(); @@ -612,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state) } static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, - .resume = gart_resume, + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, }; static struct sys_device device_gart = { - .id = 0, - .cls = &gart_sysdev_class, + .cls = &gart_sysdev_class, }; /* @@ -635,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; int i, error; - printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); + pr_info("PCI-DMA: Disabling AGP.\n"); + aper_size = aper_base = info->aper_size = 0; dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { @@ -653,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) } if (!aper_base) goto nommu; + info->aper_base = aper_base; info->aper_size = aper_size >> 20; @@ -675,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info) flush_gart(); - printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); return 0; nommu: /* Should not happen anymore */ - printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" "falling back to iommu=soft.\n"); return -1; } @@ -744,23 +752,23 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n"); - printk(KERN_WARNING "falling back to iommu=soft.\n"); + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); } return 0; } /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { start_pfn = (aper_base>>PAGE_SHIFT); init_memory_mapping(start_pfn<> PAGE_SHIFT; @@ -775,8 +783,7 @@ int __init gart_iommu_init(void) ret = dma_debug_resize_entries(iommu_pages); if (ret) - printk(KERN_DEBUG - "PCI-DMA: Cannot trace all the entries\n"); + pr_debug("PCI-DMA: Cannot trace all the entries\n"); } #endif @@ -786,15 +793,14 @@ int __init gart_iommu_init(void) */ iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; - printk(KERN_INFO - "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_addr = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); /* * Unmap the IOMMU part of the GART. The alias of the page is @@ -816,7 +822,7 @@ int __init gart_iommu_init(void) * the pages as Not-Present: */ wbinvd(); - + /* * Now all caches are flushed and we can safely enable * GART hardware. Doing it early leaves the possibility -- cgit v1.2.1 From 8cc2361bd00e87aab2827a3996a71fe9b2c9f9c4 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 17 Nov 2009 08:06:38 +0100 Subject: x86: ucode-amd: Move family check to microcde_amd.c's init function ... to avoid useless trial to load firmware on systems with unsupported AMD CPUs. Signed-off-by: Andreas Herrmann Cc: Dmitry Adamushko Cc: Mike Travis Cc: Tigran Aivazian Cc: Borislav Petkov Cc: Andreas Mohr Cc: Jack Steiner LKML-Reference: <20091117070638.GA27691@alberich.amd.com> [ v2: changed BUG_ON() to WARN_ON() ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 26e33bd8485b..63123d902103 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -34,6 +34,7 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 const struct firmware *firmware; +static int supported_cpu; struct equiv_cpu_entry { u32 installed_cpu; @@ -73,15 +74,12 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + if (!supported_cpu) return -1; - } + + memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; @@ -331,6 +329,17 @@ static void microcode_fini_cpu_amd(int cpu) void init_microcode_amd(struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &boot_cpu_data; + + WARN_ON(c->x86_vendor != X86_VENDOR_AMD); + + if (c->x86 < 0x10) { + pr_warning("microcode: AMD CPU family 0x%x not supported\n", + c->x86); + return; + } + supported_cpu = 1; + if (request_firmware(&firmware, fw_name, device)) pr_err("microcode: failed to load file %s\n", fw_name); } -- cgit v1.2.1 From 0696b711e4be45fa104c12329f617beb29c03f78 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Tue, 17 Nov 2009 13:49:50 +0800 Subject: timekeeping: Fix clock_gettime vsyscall time warp Since commit 0a544198 "timekeeping: Move NTP adjusted clock multiplier to struct timekeeper" the clock multiplier of vsyscall is updated with the unmodified clock multiplier of the clock source and not with the NTP adjusted multiplier of the timekeeper. This causes user space observerable time warps: new CLOCK-warp maximum: 120 nsecs, 00000025c337c537 -> 00000025c337c4bf Add a new argument "mult" to update_vsyscall() and hand in the timekeeping internal NTP adjusted multiplier. Signed-off-by: Lin Ming Cc: "Zhang Yanmin" Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Cc: Tony Luck LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8cb4974ff599..62f39d79b775 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,7 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, + u32 mult) { unsigned long flags; @@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; -- cgit v1.2.1 From f7f3cad06080f14f60b1453af94463ff68ea2739 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Sat, 24 Oct 2009 17:25:38 +0200 Subject: [CPUFREQ] longhaul: select Longhaul version 2 for capable CPUs There is a typo in the longhaul detection code so only Longhaul v1 or Longhaul v3 is selected. The Longhaul v2 is not selected even for CPUs which are capable of. Tested on PCChips Giga Pro board. Frequency changes work and the Longhaul v2 detects that the board is not capable of changing CPU voltage. Signed-off-by: Krzysztof Helt Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index ce2ed3e4aad9..cabd2fa3fc93 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); break; case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V1; + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; -- cgit v1.2.1 From c53614ec17fe6296a696aa4ac71a799814bb50c1 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 6 Oct 2009 17:36:53 +0200 Subject: [CPUFREQ] powernow-k8: Fix test in get_transition_latency() Not makes it a bool before the comparison. Signed-off-by: Roel Kluin Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6394aa5c7985..3f12dabeab52 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data) * set it to 1 to avoid problems in the future. * For all others it's a BIOS bug. */ - if (!boot_cpu_data.x86 == 0x11) + if (boot_cpu_data.x86 != 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; -- cgit v1.2.1 From 293afe44d75abce4252db76cbb303a7de4297ce1 Mon Sep 17 00:00:00 2001 From: John Villalovos Date: Fri, 25 Sep 2009 13:30:08 -0400 Subject: [CPUFREQ] acpi-cpufreq: blacklist Intel 0f68: Fix HT detection and put in notification message Removing the SMT/HT check, since the Errata doesn't mention Hyper-Threading. Adding in a printk, so that the user knows why acpi-cpufreq refuses to load. Also, once system is blacklisted, don't repeat checks to see if blacklisted. This also causes the message to only be printed once, rather than for each CPU. Signed-off-by: John L. Villalovos Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7d5c3b0ea8da..8b581d3905cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -526,15 +526,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) { - /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf * AL30: A Machine Check Exception (MCE) Occurring during an * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up when HT is enabled*/ + * Both Processor Cores to Lock Up. */ if (c->x86_vendor == X86_VENDOR_INTEL) { if ((c->x86 == 15) && (c->x86_model == 6) && - (c->x86_mask == 8) && smt_capable()) + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); return -ENODEV; + } } return 0; } @@ -549,13 +555,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif dprintk("acpi_cpufreq_cpu_init\n"); #ifdef CONFIG_SMP - result = acpi_cpufreq_blacklist(c); - if (result) - return result; + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; #endif data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); -- cgit v1.2.1 From 8dca15e40889e5d5e9655b03ba79c26200f760ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 2 Nov 2009 23:35:30 -0800 Subject: [CPUFREQ] speedstep-ich: fix error caused by 394122ab144dae4b276d74644a2f11c44a60ac5c "[CPUFREQ] cpumask: avoid playing with cpus_allowed in speedstep-ich.c" changed the code to mistakenly pass the current cpu as the "processor" argument of speedstep_get_frequency(), whereas it should be the type of the processor. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=14340 Based on a patch by Dave Mueller. Signed-off-by: Rusty Russell Acked-by: Dominik Brodowski Reported-by: Dave Mueller Cc: Signed-off-by: Andrew Morton Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 6911e91fb4f6..3ae5a7a3a500 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -struct get_freq_data { - unsigned int speed; - unsigned int processor; -}; - -static void get_freq_data(void *_data) +static void get_freq_data(void *_speed) { - struct get_freq_data *data = _data; + unsigned int *speed = _speed; - data->speed = speedstep_get_frequency(data->processor); + *speed = speedstep_get_frequency(speedstep_processor); } static unsigned int speedstep_get(unsigned int cpu) { - struct get_freq_data data = { .processor = cpu }; + unsigned int speed; /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", data.speed); - return data.speed; + dprintk("detected %u kHz as current frequency\n", speed); + return speed; } /** -- cgit v1.2.1 From 6dbfe5a57db3564adf7b2a65068e40f1b4a0d2db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 18:27:18 +0100 Subject: x86: Fixup last users of irq_chip->typename The typename member of struct irq_chip was kept for migration purposes and is obsolete since more than 2 years. Fix up the leftovers. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/visws_quirks.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index f068553a1b17..f084dfd97fcb 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq) } static struct irq_chip cobalt_irq_type = { - .typename = "Cobalt-APIC", + .name = "Cobalt-APIC", .startup = startup_cobalt_irq, .shutdown = disable_cobalt_irq, .enable = enable_cobalt_irq, @@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq) } static struct irq_chip piix4_master_irq_type = { - .typename = "PIIX4-master", + .name = "PIIX4-master", .startup = startup_piix4_master_irq, .ack = ack_cobalt_irq, .end = end_piix4_master_irq, @@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { - .typename = "PIIX4-virtual", + .name = "PIIX4-virtual", .shutdown = disable_8259A_irq, .enable = enable_8259A_irq, .disable = disable_8259A_irq, -- cgit v1.2.1 From 070e5c3f9989a72076e83fdd5ede3f0f3eb17264 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2009 12:27:47 +0100 Subject: x86: vmiclock: Fix printk format clockevents.mult became u32. Fix the printk format. Pointed-out-by: Randy Dunlap Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vmiclock_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 611b9e2360d3..74c92bb194df 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void) evt->min_delta_ns = clockevent_delta2ns(1, evt); evt->cpumask = cpumask_of(cpu); - printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", + printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n", evt->name, evt->mult, evt->shift); clockevents_register_device(evt); } -- cgit v1.2.1 From e670761f12f4069d204f433bf547d9c679a4fd05 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86: apic: Remove not needed #ifdef Suresh made dmar_table_init() already have that protection. Signed-off-by: Yinghai Lu LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4c689f45b238..ad8c75b9e453 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1377,14 +1377,11 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; - int dmar_table_init_ret = 0; + int dmar_table_init_ret; -#ifdef CONFIG_INTR_REMAP dmar_table_init_ret = dmar_table_init(); - if (dmar_table_init_ret) - pr_debug("dmar_table_init() failed with %d:\n", - dmar_table_init_ret); -#endif + if (dmar_table_init_ret && !x2apic_supported()) + return; ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { -- cgit v1.2.1 From 37ef2a3029fde884808ff1b369677abc7dd9a79a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 21 Nov 2009 00:23:37 -0800 Subject: x86: Re-get cfg_new in case reuse/move irq_desc When irq_desc is moved, we need to make sure to use the right cfg_new. Signed-off-by: Yinghai Lu LKML-Reference: <4B07A739.3030104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ff237199fa23..085e60e303cf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3186,6 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; -- cgit v1.2.1 From 6e3d8330ae2c4b2c11a9577a0130d2ecda1c610d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Nov 2009 10:19:20 +0100 Subject: perf events: Do not generate function trace entries in perf code Decreases perf overhead when function tracing is enabled, by about 50%. Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 68537e957a9b..1d2cb383410e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,6 +5,7 @@ # Don't trace early stages of a secondary CPU boot ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg endif # Make sure load_percpu_segment has no stackprotector -- cgit v1.2.1 From 9f800de38b05d84809e89f16671d636a140eede7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 12:45:25 +0100 Subject: x86/amd-iommu: un__init iommu_setup_msi This function may be called on the resume path and can not be dropped after booting. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0d4581e602a4..72bdbdac9b48 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -926,7 +926,7 @@ static int __init init_iommu_all(struct acpi_table_header *table) * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; -- cgit v1.2.1 From be831297716036de5b24308447ecb69f1706a846 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 12:50:00 +0100 Subject: x86/amd-iommu: attach devices to pre-allocated domains early For some devices the ACPI table may define unity map requirements which must me met when the IOMMU is enabled. So we need to attach devices to their domains as early as possible so that these mappings are in place when needed. This patch assigns the domains right after they are allocated. Otherwise this can result in I/O page faults before a driver binds to a device and BIOS is still using it. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 093bd526c949..b74b21247584 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2047,10 +2047,10 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - u16 devid; + u16 devid, __devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); + __devid = devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -2065,6 +2065,10 @@ static void prealloc_protection_domains(void) init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(iommu, &dma_dom->domain, devid); + if (__devid != devid) + attach_device(iommu, &dma_dom->domain, __devid); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } -- cgit v1.2.1 From ba6909b719a5ccc0c8100d2895bb7ff557b2eeae Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Mon, 23 Nov 2009 21:17:13 +0530 Subject: hw-breakpoint: Attribute authorship of hw-breakpoint related files Attribute authorship to developers of hw-breakpoint related files. Signed-off-by: K.Prasad Cc: Alan Stern Cc: Frederic Weisbecker LKML-Reference: <20091123154713.GA5593@in.ibm.com> [ v2: moved it to latest -tip ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 752daebe91c6..4d267fb77828 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -16,6 +16,10 @@ * Copyright (C) 2007 Alan Stern * Copyright (C) 2009 IBM Corporation * Copyright (C) 2009 Frederic Weisbecker + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker */ /* -- cgit v1.2.1 From 0444c9bd0cf4e0eb946a7fcaf34765accfa9404a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 20 Nov 2009 14:03:05 +0000 Subject: x86: Tighten conditionals on MCE related statistics irq_thermal_count is only being maintained when X86_THERMAL_VECTOR, and both X86_THERMAL_VECTOR and X86_MCE_THRESHOLD don't need extra wrapping in X86_MCE conditionals. Signed-off-by: Jan Beulich Cc: Hidetoshi Seto Cc: Yong Wang Cc: Suresh Siddha Cc: Andi Kleen Cc: Borislav Petkov Cc: Arjan van de Ven LKML-Reference: <4B06AFA902000078000211F8@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 04bbd5278568..19212cb01558 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); seq_printf(p, " TLB shootdowns\n"); #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR seq_printf(p, "%*s: ", prec, "TRM"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); seq_printf(p, " Thermal event interrupts\n"); -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD seq_printf(p, "%*s: ", prec, "THR"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); seq_printf(p, " Threshold APIC interrupts\n"); -# endif #endif #ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); @@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_call_count; sum += irq_stats(cpu)->irq_tlb_count; #endif -#ifdef CONFIG_X86_MCE +#ifdef CONFIG_X86_THERMAL_VECTOR sum += irq_stats(cpu)->irq_thermal_count; -# ifdef CONFIG_X86_MCE_THRESHOLD +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD sum += irq_stats(cpu)->irq_threshold_count; -# endif #endif #ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); -- cgit v1.2.1 From 581f202bcd60acbc3af1f5faa429e570c512f8a3 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 20 Nov 2009 15:48:26 -0600 Subject: x86: UV RTC: Always enable RTC clocksource Always enable the RTC clocksource on UV systems. Signed-off-by: Dimitri Sivanich LKML-Reference: <20091120214826.GA20016@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3da7b1d8bfd3..3c84aa001c11 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -74,7 +74,6 @@ struct uv_rtc_timer_head { */ static struct uv_rtc_timer_head **blade_info __read_mostly; -static int uv_rtc_enable; static int uv_rtc_evt_enable; /* @@ -335,14 +334,6 @@ static void uv_rtc_interrupt(void) ced->event_handler(ced); } -static int __init uv_enable_rtc(char *str) -{ - uv_rtc_enable = 1; - - return 1; -} -__setup("uvrtc", uv_enable_rtc); - static int __init uv_enable_evt_rtc(char *str) { uv_rtc_evt_enable = 1; @@ -364,12 +355,16 @@ static __init int uv_rtc_setup_clock(void) { int rc; - if (!uv_rtc_enable || !is_uv_system() || x86_platform_ipi_callback) + if (!is_uv_system()) return -ENODEV; clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); + /* If single blade, prefer tsc */ + if (uv_num_possible_blades() == 1) + clocksource_uv.rating = 250; + rc = clocksource_register(&clocksource_uv); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); @@ -377,7 +372,7 @@ static __init int uv_rtc_setup_clock(void) printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", sn_rtc_cycles_per_second/(unsigned long)1E6); - if (rc || !uv_rtc_evt_enable) + if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) return rc; /* Setup and register clockevents */ -- cgit v1.2.1 From 27c13ecec4d8856687b50b959e1146845b478f95 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 21 Nov 2009 14:01:45 +0100 Subject: x86, cpu: mv display_cacheinfo -> cpu_detect_cache_sizes display_cacheinfo() doesn't display anything anymore and it is used to detect CPU cache sizes. Rename it accordingly. Signed-off-by: Borislav Petkov LKML-Reference: <20091121130145.GA31357@liondog.tnic> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/centaur.c | 2 +- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/cpu.h | 2 +- arch/x86/kernel/cpu/cyrix.c | 2 +- arch/x86/kernel/cpu/transmeta.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c910a716a71c..7128b3799cec 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -535,7 +535,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) { diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index c95e831bb095..e58d978e0758 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); } enum { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 61242a56c2d6..9bf845dc8055 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void) static void __cpuinit default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 - display_cacheinfo(c); + cpu_detect_cache_sizes(c); #else /* Not much we can do here... */ /* Check if at least it has cpuid */ @@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) } } -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 6de9a908e400..3624e8a0f71b 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,6 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; -extern void display_cacheinfo(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 19807b89f058..4fbd384fb645 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c) /* Handle the GX (Formally known as the GX2) */ if (c->x86 == 5 && c->x86_model == 5) - display_cacheinfo(c); + cpu_detect_cache_sizes(c); else init_cyrix(c); } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index bb62b3e5caad..28000743bbb0 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) early_init_transmeta(c); - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Print CMS and CPU revision */ max = cpuid_eax(0x80860000); -- cgit v1.2.1 From 1261a02a0c0ab8e643125705f0d1d83e5090e4d1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 24 Nov 2009 05:27:18 -0800 Subject: perf_events, x86: Fix validate_event bug The validate_event() was failing on valid event combinations. The function was assuming that if x86_schedule_event() returned 0, it meant error. But x86_schedule_event() returns the counter index and 0 is a perfectly valid value. An error is returned if the function returns a negative value. Furthermore, validate_event() was also failing for event groups because the event->pmu was not set until after hw_perf_event_init(). Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: paulus@samba.org Cc: perfmon2-devel@lists.sourceforge.net Cc: eranian@gmail.com LKML-Reference: <4b0bdf36.1818d00a.07cc.25ae@mx.google.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index bd8743024204..c1bbed1021d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2229,10 +2229,10 @@ validate_event(struct cpu_hw_events *cpuc, struct perf_event *event) { struct hw_perf_event fake_event = event->hw; - if (event->pmu != &pmu) + if (event->pmu && event->pmu != &pmu) return 0; - return x86_schedule_event(cpuc, &fake_event); + return x86_schedule_event(cpuc, &fake_event) >= 0; } static int validate_group(struct perf_event *event) -- cgit v1.2.1 From d77b81974521c82fa6fda38dfff1b491dcc62a32 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 24 Nov 2009 16:53:00 +0100 Subject: [CPUFREQ] Enable ACPI PDC handshake for VIA/Centaur CPUs In commit 0de51088e6a82bc8413d3ca9e28bbca2788b5b53, we introduced the use of acpi-cpufreq on VIA/Centaur CPU's by removing a vendor check for VENDOR_INTEL. However, as it turns out, at least the Nano CPU's also need the PDC (processor driver capabilities) handshake in order to activate the methods required for acpi-cpufreq. Since arch_acpi_processor_init_pdc() contains another vendor check for Intel, the PDC is not initialized on VIA CPU's. The resulting behavior of a current mainline kernel on such systems is: acpi-cpufreq loads and it indicates CPU frequency changes. However, the CPU stays at a single frequency This trivial patch ensures that init_intel_pdc() is called on Intel and VIA/Centaur CPU's alike. Signed-off-by: Harald Welte Signed-off-by: Dave Jones --- arch/x86/kernel/acpi/processor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d296f4a195c9..d85d1b2432ba 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL) + if (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR) init_intel_pdc(pr, c); return; -- cgit v1.2.1 From b8cbe7e82ec8b55d7bbdde66fc69e788fde00dc6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 3 Nov 2009 14:57:56 +1030 Subject: [CPUFREQ] cpumask: don't put a cpumask on the stack in x86...cpufreq/powernow-k8.c It's still mugging the current process's cpumask, but as comment in 1ff6e97f1d says, it's not a trivial fix. So, at least we can use a cpumask_var_t to do the Wrong Thing the Right Way :) Signed-off-by: Rusty Russell To: cpufreq@vger.kernel.org Cc: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3f12dabeab52..f30d25383940 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { - cpumask_t oldmask; + cpumask_var_t oldmask; struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol, checkfid = data->currfid; checkvid = data->currvid; - /* only run on specific CPU from here on */ - oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + /* only run on specific CPU from here on. */ + /* This is poor form: use a workqueue or smp_call_function_single */ + if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(oldmask, tsk_cpumask(current)); + set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol, ret = 0; err_out: - set_cpus_allowed_ptr(current, &oldmask); + set_cpus_allowed_ptr(current, oldmask); + free_cpumask_var(oldmask); return ret; } -- cgit v1.2.1 From db2820dd5445a44b4726f15a2bc89b9ded2503eb Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Sun, 25 Oct 2009 19:45:57 +0100 Subject: [CPUFREQ] powernow-k6: set transition latency value so ondemand governor can be used Set the transition latency to value smaller than CPUFREQ_ETERNAL so governors other than "performance" work (like the "ondemand" one). The value is found in "AMD PowerNow! Technology Platform Design Guide for Embedded Processors" dated December 2000 (AMD doc #24267A). There is the answer to one of FAQs on page 40 which states that suggested complete transition period is 200 us. Tested on K6-2+ CPU with K6-3 core (model 13, stepping 4). Signed-off-by: Krzysztof Helt Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index f10dea409f40..cb01dac267d3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) } /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cpuinfo.transition_latency = 200000; policy->cur = busfreq * max_multiplier; result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); -- cgit v1.2.1 From 1cce76c2ac60df40b02bf747982fb3f00e68f50a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 17 Nov 2009 14:39:53 -0800 Subject: [CPUFREQ] use an enum for speedstep processor identification The "unsigned int processor" everywhere confused Rusty, leading to breakage when he passed in smp_processor_id(). Signed-off-by: Rusty Russell Acked-by: Dominik Brodowski Signed-off-by: Andrew Morton Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 24 ++++++++++++------------ arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 3ae5a7a3a500..2ce8e0b5cc54 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev; /* speedstep_processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; static u32 pmbase; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index f4c290b8482f..ad0083abfa23 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -34,7 +34,7 @@ static int relaxed_check; * GET PROCESSOR CORE SPEED IN KHZ * *********************************************************************/ -static unsigned int pentium3_get_frequency(unsigned int processor) +static unsigned int pentium3_get_frequency(enum speedstep_processor processor) { /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ struct { @@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void) /* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(unsigned int processor) +unsigned int speedstep_get_frequency(enum speedstep_processor processor) { switch (processor) { case SPEEDSTEP_CPU_PCORE: @@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor); * DETECT SPEEDSTEP SPEEDS * *********************************************************************/ -unsigned int speedstep_get_freqs(unsigned int processor, +unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h index 2b6c04e5a304..70d9cea1219d 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h @@ -11,18 +11,18 @@ /* processors */ - -#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ -#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ -#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ - +enum speedstep_processor { + SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ + SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ + SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ /* the following processors are not speedstep-capable and are not auto-detected * in speedstep_detect_processor(). However, their speed can be detected using * the speedstep_get_frequency() call. */ -#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ -#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ -#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ + SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ + SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ + SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ +}; /* speedstep states -- only two of them */ @@ -31,10 +31,10 @@ /* detect a speedstep-capable processor */ -extern unsigned int speedstep_detect_processor (void); +extern enum speedstep_processor speedstep_detect_processor(void); /* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(unsigned int processor); +extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); /* detect the low and high speeds of the processor. The callback @@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor); * SPEEDSTEP_LOW; the second argument is zero so that no * cpufreq_notify_transition calls are initiated. */ -extern unsigned int speedstep_get_freqs(unsigned int processor, +extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, unsigned int *low_speed, unsigned int *high_speed, unsigned int *transition_latency, diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index befea088e4f5..04d73c114e49 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -35,7 +35,7 @@ static int smi_cmd; static unsigned int smi_sig; /* info about the processor */ -static unsigned int speedstep_processor; +static enum speedstep_processor speedstep_processor; /* * There are only two frequency states for each processor. Values -- cgit v1.2.1 From e2f74f355e9e2914483db10c05d70e69e0b7ae04 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Thu, 19 Nov 2009 12:31:01 +0100 Subject: [ACPI/CPUFREQ] Introduce bios_limit per cpu cpufreq sysfs interface This interface is mainly intended (and implemented) for ACPI _PPC BIOS frequency limitations, but other cpufreq drivers can also use it for similar use-cases. Why is this needed: Currently it's not obvious why cpufreq got limited. People see cpufreq/scaling_max_freq reduced, but this could have happened by: - any userspace prog writing to scaling_max_freq - thermal limitations - hardware (_PPC in ACPI case) limitiations Therefore export bios_limit (in kHz) to: - Point the user that it's the BIOS (broken or intended) which limits frequency - Export it as a sysfs interface for userspace progs. While this was a rarely used feature on laptops, there will appear more and more server implemenations providing "Green IT" features like allowing the service processor to limit the frequency. People want to know about HW/BIOS frequency limitations. All ACPI P-state driven cpufreq drivers are covered with this patch: - powernow-k8 - powernow-k7 - acpi-cpufreq Tested with a patched DSDT which limits the first two cores (_PPC returns 1) via _PPC, exposed by bios_limit: # echo 2200000 >cpu2/cpufreq/scaling_max_freq # cat cpu*/cpufreq/scaling_max_freq 2600000 2600000 2200000 2200000 # #scaling_max_freq shows general user/thermal/BIOS limitations # cat cpu*/cpufreq/bios_limit 2600000 2600000 2800000 2800000 # #bios_limit only shows the HW/BIOS limitation CC: Pallipadi Venkatesh CC: Len Brown CC: davej@codemonkey.org.uk CC: linux@dominikbrodowski.net Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 17 +++++++++-------- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 19 +++++++++++-------- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 17 +++++++++-------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8b581d3905cb..d2e7c77c1ea4 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -764,14 +764,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = { }; static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, }; static int __init acpi_cpufreq_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index d47c775eb0ab..9a97116f89e5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = { }; static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + .bios_limit = acpi_processor_get_bios_limit, +#endif + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, }; static int __init powernow_init(void) diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f30d25383940..a9df9441a9a2 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1398,14 +1398,15 @@ static struct freq_attr *powernow_k8_attr[] = { }; static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, + .verify = powernowk8_verify, + .target = powernowk8_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, }; /* driver entry point for init */ -- cgit v1.2.1 From 2263576cfc6e8f6ab038126c3254404b9fcb1c33 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 13 Nov 2009 10:06:08 +0800 Subject: ACPICA: Add post-order callback to acpi_walk_namespace The existing interface only has a pre-order callback. This change adds an additional parameter for a post-order callback which will be more useful for bus scans. ACPICA BZ 779. Also update the external calls to acpi_walk_namespace. http://www.acpica.org/bugzilla/show_bug.cgi?id=779 Signed-off-by: Lin Ming Signed-off-by: Bob Moore Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index cabd2fa3fc93..7e7eea4f8261 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) /* Find ACPI data for processor */ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, + ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, NULL, (void *)&pr); /* Check ACPI support for C3 state */ -- cgit v1.2.1 From 273bee27fa9f79d94b78c83506016f2e41e78983 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 25 Nov 2009 08:46:28 +0900 Subject: x86: Fix iommu=soft boot option iommu=soft boot option forces the kernel to use swiotlb. ( This has the side-effect of enabling the swiotlb over the GART if this boot option is provided. This is the desired behavior of the swiotlb boot option and works like that for all other hw-IOMMU drivers. ) Signed-off-by: FUJITA Tomonori Cc: yinghai@kernel.org LKML-Reference: <20091125084611O.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e36e71daa44c..e3c0a66b9e77 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -50,6 +50,8 @@ static struct dma_map_ops swiotlb_dma_ops = { */ int __init pci_swiotlb_init(void) { + int use_swiotlb = swiotlb | swiotlb_force; + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 if (!no_iommu && max_pfn > MAX_DMA32_PFN) @@ -63,5 +65,5 @@ int __init pci_swiotlb_init(void) dma_ops = &swiotlb_dma_ops; } - return swiotlb_force; + return use_swiotlb; } -- cgit v1.2.1 From 28b4e0d86acf59ae3bc422921138a4958458326e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Nov 2009 22:24:44 +0900 Subject: x86: Rename global percpu symbol dr7 to cpu_dr7 Percpu symbols now occupy the same namespace as other global symbols and as such short global symbols without subsystem prefix tend to collide with local variables. dr7 percpu variable used by x86 was hit by this. Rename it to cpu_dr7. The rename also makes it more consistent with its fellow cpu_debugreg percpu variable. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Rusty Russell Cc: Christoph Lameter Cc: Linus Torvalds , Cc: Andrew Morton LKML-Reference: <20091125115856.GA17856@elte.hu> Signed-off-by: Ingo Molnar Reported-by: Stephen Rothwell --- arch/x86/kernel/hw_breakpoint.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 4d267fb77828..92ea5aad0b5c 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -46,8 +46,8 @@ #include /* Per cpu debug control register value */ -DEFINE_PER_CPU(unsigned long, dr7); -EXPORT_PER_CPU_SYMBOL(dr7); +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); /* Per cpu debug address registers values */ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); @@ -118,7 +118,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) set_debugreg(info->address, i); __get_cpu_var(cpu_debugreg[i]) = info->address; - dr7 = &__get_cpu_var(dr7); + dr7 = &__get_cpu_var(cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -153,7 +153,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) return; - dr7 = &__get_cpu_var(dr7); + dr7 = &__get_cpu_var(cpu_dr7); *dr7 &= ~encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); @@ -437,7 +437,7 @@ void hw_breakpoint_restore(void) set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); set_debugreg(current->thread.debugreg6, 6); - set_debugreg(__get_cpu_var(dr7), 7); + set_debugreg(__get_cpu_var(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); -- cgit v1.2.1 From b803090615ccec669681ff85ce28671e7bfefa3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Nov 2009 08:17:31 +0100 Subject: x86: dumpstack: Clean up the x86_stack_ids[][] initalization and other details Make the initialization more readable, plus tidy up a few small visual details as well. No change in functionality. LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 9 ++++----- arch/x86/kernel/dumpstack_64.c | 25 +++++++++++++------------ 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f7dd2a7c3bf4..e0ed4c7abb62 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -10,9 +10,9 @@ #include #include #include +#include #include #include -#include #include @@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; + stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.sp; @@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, - data, NULL, &graph); + bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6be177e..cfec478a4e42 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -10,26 +10,28 @@ #include #include #include +#include #include #include -#include #include #include "dumpstack.h" +#define N_EXCEPTION_STACKS_END \ + (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", + [ DEBUG_STACK-1 ] = "#DB", + [ NMI_STACK-1 ] = "NMI", + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ STACKFAULT_STACK-1 ] = "#SS", + [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" + [ N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif - }; +}; int x86_is_stack_id(int id, char *name) { @@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name) } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) + unsigned *usedp, char **idp) { unsigned k; @@ -202,7 +204,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -303,4 +305,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - -- cgit v1.2.1 From 67f2de0bf9141dd9fe9189d0caaa28d7ad21a523 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 26 Nov 2009 08:29:10 +0100 Subject: x86: dumpstack, 64-bit: Disable preemption when walking the IRQ/exception stacks This warning: [ 847.140022] rb_producer D 0000000000000000 5928 519 2 0x00000000 [ 847.203627] BUG: using smp_processor_id() in preemptible [00000000] code: khungtaskd/517 [ 847.207360] caller is show_stack_log_lvl+0x2e/0x241 [ 847.210364] Pid: 517, comm: khungtaskd Not tainted 2.6.32-rc8-tip+ #13761 [ 847.213395] Call Trace: [ 847.215847] [] debug_smp_processor_id+0x1f0/0x20a [ 847.216809] [] show_stack_log_lvl+0x2e/0x241 [ 847.220027] [] show_stack+0x1c/0x1e [ 847.223365] [] sched_show_task+0xe4/0xe9 [ 847.226694] [] check_hung_task+0x140/0x199 [ 847.230261] [] check_hung_uninterruptible_tasks+0x1b7/0x20f [ 847.233371] [] ? watchdog+0x0/0x50 [ 847.236683] [] watchdog+0x4e/0x50 [ 847.240034] [] kthread+0x97/0x9f [ 847.243372] [] child_rip+0xa/0x20 [ 847.246690] [] ? restore_args+0x0/0x30 [ 847.250019] [] ? _spin_lock+0xe/0x10 [ 847.253351] [] ? kthread+0x0/0x9f [ 847.256833] [] ? child_rip+0x0/0x20 Happens because on preempt-RCU, khungd calls show_stack() with preemption enabled. Make sure we are not preemptible while walking the IRQ and exception stacks on 64-bit. (32-bit stack dumping is preemption safe.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index cfec478a4e42..8e740934bd1f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -206,19 +206,22 @@ void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl) { + unsigned long *irq_stack_end; + unsigned long *irq_stack; unsigned long *stack; + int cpu; int i; - const int cpu = smp_processor_id(); - unsigned long *irq_stack_end = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - unsigned long *irq_stack = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + + preempt_disable(); + cpu = smp_processor_id(); + + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); /* - * debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu. + * Debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu: */ - if (sp == NULL) { if (task) sp = (unsigned long *)task->thread.sp; @@ -242,6 +245,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } + preempt_enable(); + printk("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -- cgit v1.2.1 From 605bfaee9078cd0b01d83402315389839ee4bb5c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Nov 2009 05:35:42 +0100 Subject: hw-breakpoints: Simplify error handling in breakpoint creation requests This simplifies the error handling when we create a breakpoint. We don't need to check the NULL return value corner case anymore since we have improved perf_event_create_kernel_counter() to always return an error code in the failure case. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Steven Rostedt Cc: Prasad LKML-Reference: <1259210142-5714-3-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b25f8947ed7a..75e0cd847bd6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -657,10 +657,7 @@ restore: tsk, true); thread->ptrace_bps[i] = NULL; - if (!bp) { /* incorrect bp, or we have a bug in bp API */ - rc = -EINVAL; - break; - } + /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { rc = PTR_ERR(bp); bp = NULL; @@ -729,9 +726,6 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, tsk, bp->attr.disabled); } - - if (!bp) - return -EIO; /* * CHECKME: the previous code returned -EIO if the addr wasn't a * valid task virtual addr. The new one will return -EINVAL in this -- cgit v1.2.1 From 2c31b7958fd21df9fa04e5c36cda0f063ac70b27 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 26 Nov 2009 06:04:38 +0100 Subject: x86/hw-breakpoints: Don't lose GE flag while disabling a breakpoint When we schedule out a breakpoint from the cpu, we also incidentally remove the "Global exact breakpoint" flag from the breakpoint control register. It makes us losing the fine grained precision about the origin of the instructions that may trigger breakpoint exceptions for the other breakpoints running in this cpu. Reported-by: Prasad Signed-off-by: Frederic Weisbecker LKML-Reference: <1259211878-6013-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/hw_breakpoint.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 92ea5aad0b5c..d42f65ac4927 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -59,21 +59,27 @@ static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); -/* - * Encode the length, type, Exact, and Enable bits for a particular breakpoint - * as stored in debug register 7. - */ -unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) { unsigned long bp_info; bp_info = (len | type) & 0xf; bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); - bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)) | - DR_GLOBAL_SLOWDOWN; + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + return bp_info; } +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + /* * Decode the length and type bits for a particular breakpoint as * stored in debug register 7. Return the "enabled" status. @@ -154,7 +160,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) return; dr7 = &__get_cpu_var(cpu_dr7); - *dr7 &= ~encode_dr7(i, info->len, info->type); + *dr7 &= ~__encode_dr7(i, info->len, info->type); set_debugreg(*dr7, 7); } -- cgit v1.2.1 From 9b3660a55a9052518c91cc7c62d89e22f3f6f490 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 17 Nov 2009 18:22:16 -0600 Subject: x86: Limit number of per cpu TSC sync messages Limit the number of per cpu TSC sync messages by only printing to the console if an error occurs, otherwise print as a DEBUG message. The info message "Skipping synchronization ..." is only printed after the last cpu has booted. Signed-off-by: Mike Travis Cc: Heiko Carstens Cc: Roland Dreier Cc: Randy Dunlap Cc: Tejun Heo Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Yinghai Lu Cc: David Rientjes Cc: Steven Rostedt Cc: Rusty Russell Cc: Hidetoshi Seto Cc: Jack Steiner Cc: Frederic Weisbecker LKML-Reference: <20091118002222.181053000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f37930954d15..eed156851f5d 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); + if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) + pr_info( + "Skipped synchronization checks as TSC is reliable.\n"); return; } - pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); - /* * Reset it - in case this is a second bootup: */ @@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu) cpu_relax(); if (nr_warps) { - printk("\n"); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { - printk(" passed.\n"); + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); } /* -- cgit v1.2.1 From 767df1bdd8cbff2c8c40c9ac8295bbdaa5fb24c4 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 26 Nov 2009 17:29:02 +0900 Subject: x86, mce: Add __cpuinit to hotplug callback functions The mce_disable_cpu() and mce_reenable_cpu() are called only from mce_cpu_callback() which is marked as __cpuinit. So these functions can be __cpuinit too. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen LKML-Reference: <4B0E3C4E.4090809@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5f277cad2ed7..0bcaa3875863 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1953,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { @@ -1970,7 +1971,7 @@ static void mce_disable_cpu(void *h) } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; -- cgit v1.2.1 From 8ec6993d9f7d961014af970ded57542961fe9ad9 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 25 Nov 2009 11:17:36 -0500 Subject: x86, 64-bit: Set data segments to null after switching to 64-bit mode This prevents kernel threads from inheriting non-null segment selectors, and causing optimizations in __switch_to() to be ineffective. Signed-off-by: Brian Gerst Cc: Tim Blechmann Cc: Linus Torvalds Cc: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Jan Beulich LKML-Reference: <1259165856-3512-1-git-send-email-brgerst@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 26406601031c..17ba9ecf27c5 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -212,8 +212,8 @@ ENTRY(secondary_startup_64) */ lgdt early_gdt_descr(%rip) - /* set up data segments. actually 0 would do too */ - movl $__KERNEL_DS,%eax + /* set up data segments */ + xorl %eax,%eax movl %eax,%ds movl %eax,%ss movl %eax,%es -- cgit v1.2.1 From 918bc960dc630b1a79c0d2991a81985812ff69f5 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 25 Nov 2009 10:20:19 -0600 Subject: x86: SGI UV: Map low MMR ranges Explicitly mmap the UV chipset MMR address ranges used to access blade-local registers. Although these same MMRs are also mmaped at higher addresses, the low range is more convenient when accessing blade-local registers. The low range addresses always alias to the local blade regardless of the blade id. Signed-off-by: Jack Steiner LKML-Reference: <20091125162018.GA25445@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5f5886a6b53..6d425490fb1f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -409,6 +409,12 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + static __init void uv_rtc_init(void) { long status; @@ -550,6 +556,8 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + map_low_mmrs(); + m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; -- cgit v1.2.1 From 5fa10b28e57f94a90535cfeafe89dcee9f47d540 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Nov 2009 04:55:53 +0100 Subject: hw-breakpoints: Use struct perf_event_attr to define user breakpoints In-kernel user breakpoints are created using functions in which we pass breakpoint parameters as individual variables: address, length and type. Although it fits well for x86, this just does not scale across archictectures that may support this api later as these may have more or different needs. Pass in a perf_event_attr structure instead because it is meant to evolve as much as possible into a generic hardware breakpoint parameter structure. Reported-by: K.Prasad Signed-off-by: Frederic Weisbecker LKML-Reference: <1259294154-5197-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 74 +++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 75e0cd847bd6..2941b32ea666 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -593,6 +593,34 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } +static struct perf_event * +ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + struct task_struct *tsk) +{ + int err; + int gen_len, gen_type; + DEFINE_BREAKPOINT_ATTR(attr); + + /* + * We shoud have at least an inactive breakpoint at this + * slot. It means the user is writing dr7 without having + * written the address register first + */ + if (!bp) + return ERR_PTR(-EINVAL); + + err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); + if (err) + return ERR_PTR(err); + + attr = bp->attr; + attr.bp_len = gen_len; + attr.bp_type = gen_type; + attr.disabled = 0; + + return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); +} + /* * Handle ptrace writes to debug register 7. */ @@ -603,7 +631,6 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) int i, orig_ret = 0, rc = 0; int enabled, second_pass = 0; unsigned len, type; - int gen_len, gen_type; struct perf_event *bp; data &= ~DR_CONTROL_RESERVED; @@ -634,33 +661,12 @@ restore: continue; } - /* - * We shoud have at least an inactive breakpoint at this - * slot. It means the user is writing dr7 without having - * written the address register first - */ - if (!bp) { - rc = -EINVAL; - break; - } - - rc = arch_bp_generic_fields(len, type, &gen_len, &gen_type); - if (rc) - break; - - /* - * This is a temporary thing as bp is unregistered/registered - * to simulate modification - */ - bp = modify_user_hw_breakpoint(bp, bp->attr.bp_addr, gen_len, - gen_type, bp->callback, - tsk, true); - thread->ptrace_bps[i] = NULL; + bp = ptrace_modify_breakpoint(bp, len, type, tsk); /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { rc = PTR_ERR(bp); - bp = NULL; + thread->ptrace_bps[i] = NULL; break; } thread->ptrace_bps[i] = bp; @@ -707,24 +713,26 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; + DEFINE_BREAKPOINT_ATTR(attr); if (!t->ptrace_bps[nr]) { /* * Put stub len and type to register (reserve) an inactive but * correct bp */ - bp = register_user_hw_breakpoint(addr, HW_BREAKPOINT_LEN_1, - HW_BREAKPOINT_W, - ptrace_triggered, tsk, - false); + attr.bp_addr = addr; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + + bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); } else { bp = t->ptrace_bps[nr]; t->ptrace_bps[nr] = NULL; - bp = modify_user_hw_breakpoint(bp, addr, bp->attr.bp_len, - bp->attr.bp_type, - bp->callback, - tsk, - bp->attr.disabled); + + attr = bp->attr; + attr.bp_addr = addr; + bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } /* * CHECKME: the previous code returned -EIO if the addr wasn't a -- cgit v1.2.1 From 6a9401a7ac13e62ef2baf4d46e022d303edc3050 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 13:22:21 +0100 Subject: x86/amd-iommu: Separate internal interface definitions This patch moves all function declarations which are only used inside the driver code to a seperate header file. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 1 + arch/x86/kernel/amd_iommu_init.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b74b21247584..50d2b05a458b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 72bdbdac9b48..db30cfe86fce 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From bf3118c1276d27fe9e84aa42382da25ee0750777 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 13:39:19 +0100 Subject: x86/amd-iommu: Update copyright headers This patch updates the copyright headers in the relevant AMD IOMMU driver files to match the date of the latest changes. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 50d2b05a458b..7fe28be3b548 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index db30cfe86fce..cee11424d412 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. * Author: Joerg Roedel * Leo Duran * -- cgit v1.2.1 From bb52777ec4d736c2d7c4f037b32d4eeeb172ed89 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 14:31:51 +0100 Subject: x86/amd-iommu: Add an index field to struct amd_iommu This patch adds an index field to struct amd_iommu which can be used to lookup it up in an array. This index will be used in struct protection_domain to keep track which protection domain has devices behind which IOMMU. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index cee11424d412..8567d1698027 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,10 @@ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ +/* Array to assign indices to IOMMUs*/ +struct amd_iommu *amd_iommus[MAX_IOMMUS]; +int amd_iommus_present; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -840,7 +844,18 @@ static void __init free_iommu_all(void) static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); + + /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); + iommu->index = amd_iommus_present++; + + if (unlikely(iommu->index >= MAX_IOMMUS)) { + WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n"); + return -ENOSYS; + } + + /* Index is fine - add IOMMU to the array */ + amd_iommus[iommu->index] = iommu; /* * Copy data from ACPI table entry to the iommu struct -- cgit v1.2.1 From c459611424d8b8396060eb766e23bd0c70c993bc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 14:57:32 +0100 Subject: x86/amd-iommu: Add per IOMMU reference counting This patch adds reference counting for protection domains per IOMMU. This allows a smarter TLB flushing strategy. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7fe28be3b548..8c38f0085403 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1175,7 +1175,9 @@ static void __attach_device(struct amd_iommu *iommu, /* update DTE entry */ set_dte_entry(devid, domain); - domain->dev_cnt += 1; + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; /* ready */ spin_unlock(&domain->lock); @@ -1209,6 +1211,9 @@ static void attach_device(struct amd_iommu *iommu, */ static void __detach_device(struct protection_domain *domain, u16 devid) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + BUG_ON(!iommu); /* lock domain */ spin_lock(&domain->lock); @@ -1223,8 +1228,9 @@ static void __detach_device(struct protection_domain *domain, u16 devid) amd_iommu_apply_erratum_63(devid); - /* decrease reference counter */ - domain->dev_cnt -= 1; + /* decrease reference counters */ + domain->dev_iommu[iommu->index] -= 1; + domain->dev_cnt -= 1; /* ready */ spin_unlock(&domain->lock); -- cgit v1.2.1 From 0518a3a4585cb3eeeaf14ca57131f11d252130c6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:00:05 +0100 Subject: x86/amd-iommu: Add function to complete a tlb flush This patch adds a function to the AMD IOMMU driver which completes all queued commands an all IOMMUs a specific domain has devices attached on. This is required in a later patch when per-domain flushing is implemented. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8c38f0085403..8fa5cc3e02d2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -376,6 +376,22 @@ out: return 0; } +static void iommu_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } +} + /* * Command send function for invalidating a device table entry */ @@ -1758,7 +1774,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, if (addr == DMA_ERROR_CODE) goto out; - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1791,7 +1807,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, __unmap_single(iommu, domain->priv, dma_addr, size, dir); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1863,7 +1879,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1914,7 +1930,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1969,7 +1985,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out_free; } - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2010,7 +2026,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_completion_wait(iommu); + iommu_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); -- cgit v1.2.1 From 6de8ad9b9ee0ec5b52ec8ec41401833e5e89186f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 18:30:32 +0100 Subject: x86/amd-iommu: Make iommu_flush_pages aware of multiple IOMMUs This patch extends the iommu_flush_pages function to flush the TLB entries on all IOMMUs the domain has devices on. This basically gives up the former assumption that dma_ops domains are only bound to one IOMMU in the system. For dma_ops domains this is still true but not for IOMMU-API managed domains. Giving this assumption up for dma_ops domains too allows code simplification. Further it splits out the main logic into a generic function which can be used by iommu_flush_tlb too. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 8fa5cc3e02d2..7c06e574008f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -447,10 +447,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, - u64 address, size_t size) +static void __iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0; + int s = 0, i; unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -464,9 +464,26 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, s = 1; } - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); - return 0; + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need a TLB flush + */ + iommu_queue_inv_iommu_pages(amd_iommus[i], address, + domain->id, pde, s); + } + + return; +} + +static void iommu_flush_pages(struct protection_domain *domain, + u64 address, size_t size) +{ + __iommu_flush_pages(domain, address, size, 0); } /* Flush the whole IO/TLB for a given protection domain */ @@ -1683,7 +1700,7 @@ retry: iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; } else if (unlikely(iommu_has_npcache(iommu))) - iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + iommu_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1731,7 +1748,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, dma_addr, size); dma_dom->need_flush = false; } } -- cgit v1.2.1 From dcd1e92e405449ecc5e8bd8fcfebf3b2a13d3d37 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 15:30:58 +0100 Subject: x86/amd-iommu: Use __iommu_flush_pages for tlb flushes This patch re-implements iommu_flush_tlb functions to use the __iommu_flush_pages logic. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7c06e574008f..c55aa079ded3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -451,7 +451,7 @@ static void __iommu_flush_pages(struct protection_domain *domain, u64 address, size_t size, int pde) { int s = 0, i; - unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); + unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -487,23 +487,15 @@ static void iommu_flush_pages(struct protection_domain *domain, } /* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) +static void iommu_flush_tlb_pde(struct protection_domain *domain) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - - INC_STATS_COUNTER(domain_flush_single); - - iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1); + __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } /* @@ -1236,7 +1228,7 @@ static void attach_device(struct amd_iommu *iommu, * here to evict all dirty stuff. */ iommu_queue_inv_dev_entry(iommu, devid); - iommu_flush_tlb_pde(iommu, domain->id); + iommu_flush_tlb_pde(domain); } /* @@ -1697,7 +1689,7 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(iommu, dma_dom->domain.id); + iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; } else if (unlikely(iommu_has_npcache(iommu))) iommu_flush_pages(&dma_dom->domain, address, size); -- cgit v1.2.1 From 601367d76bd19b7eea2286ae99e5b1cb5d74f38d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:08:55 +0100 Subject: x86/amd-iommu: Remove iommu_flush_domain function This iommu_flush_tlb_pde function does essentially the same. So the iommu_flush_domain function is redundant and can be removed. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c55aa079ded3..b2c19f41f238 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -528,20 +528,6 @@ static void flush_all_domains_on_iommu(struct amd_iommu *iommu) } -/* - * This function is used to flush the IO/TLB for a given protection domain - * on every IOMMU in the system - */ -static void iommu_flush_domain(u16 domid) -{ - struct amd_iommu *iommu; - - INC_STATS_COUNTER(domain_flush_all); - - for_each_iommu(iommu) - flush_domain_on_iommu(iommu, domid); -} - void amd_iommu_flush_all_domains(void) { struct amd_iommu *iommu; @@ -1464,7 +1450,7 @@ static void update_domain(struct protection_domain *domain) update_device_table(domain); flush_devices_by_domain(domain); - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); domain->updated = false; } @@ -2377,7 +2363,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova += PAGE_SIZE; } - iommu_flush_domain(domain->id); + iommu_flush_tlb_pde(domain); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, -- cgit v1.2.1 From aeb26f55337d4310840c8adc3ec7d6aebb714472 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:44:01 +0100 Subject: x86/amd-iommu: Implement protection domain list This patch adds code to keep a global list of all protection domains. This allows to simplify the resume code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/kernel/amd_iommu_init.c | 8 ++++++++ 2 files changed, 41 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b2c19f41f238..0c4319b13014 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -985,6 +985,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, * ****************************************************************************/ +/* + * This function adds a protection domain to the global protection domain list + */ +static void add_domain_to_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_add(&domain->list, &amd_iommu_pd_list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + +/* + * This function removes a protection domain to the global + * protection domain list + */ +static void del_domain_from_list(struct protection_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); + list_del(&domain->list); + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); +} + static u16 domain_id_alloc(void) { unsigned long flags; @@ -1073,6 +1098,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; + del_domain_from_list(&dom->domain); + free_pagetable(&dom->domain); for (i = 0; i < APERTURE_MAX_RANGES; ++i) { @@ -1113,6 +1140,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) dma_dom->need_flush = false; dma_dom->target_dev = 0xffff; + add_domain_to_list(&dma_dom->domain); + if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) goto free_dma_dom; @@ -2188,6 +2217,8 @@ static void protection_domain_free(struct protection_domain *domain) if (!domain) return; + del_domain_from_list(domain); + if (domain->id) domain_id_free(domain->id); @@ -2207,6 +2238,8 @@ static struct protection_domain *protection_domain_alloc(void) if (!domain->id) goto out_err; + add_domain_to_list(domain); + return domain; out_err: diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8567d1698027..73d5173765d2 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -141,6 +141,12 @@ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the struct amd_iommu *amd_iommus[MAX_IOMMUS]; int amd_iommus_present; +/* + * List of protection domains - used during resume + */ +LIST_HEAD(amd_iommu_pd_list); +spinlock_t amd_iommu_pd_lock; + /* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains @@ -1263,6 +1269,8 @@ static int __init amd_iommu_init(void) */ amd_iommu_pd_alloc_bitmap[0] = 1; + spin_lock_init(&amd_iommu_pd_lock); + /* * now the data structures are allocated and basically initialized * start the real acpi table scan -- cgit v1.2.1 From e3306664eb307ae4cc93211cd9f12d0dbd49de65 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 16:48:58 +0100 Subject: x86/amd-iommu: Reimplement amd_iommu_flush_all_domains() This patch reimplementes the amd_iommu_flush_all_domains function to use the global protection domain list instead of flushing every domain on every IOMMU. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0c4319b13014..5141f5608c5c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -530,10 +530,12 @@ static void flush_all_domains_on_iommu(struct amd_iommu *iommu) void amd_iommu_flush_all_domains(void) { - struct amd_iommu *iommu; + struct protection_domain *domain; - for_each_iommu(iommu) - flush_all_domains_on_iommu(iommu); + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + iommu_flush_tlb_pde(domain); + iommu_flush_complete(domain); + } } static void flush_all_devices_for_iommu(struct amd_iommu *iommu) -- cgit v1.2.1 From 09b4280439ef6fdc55f1353a9135034336eb5d26 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 20 Nov 2009 17:02:44 +0100 Subject: x86/amd-iommu: Reimplement flush_all_domains_on_iommu() This patch reimplements the function flush_all_domains_on_iommu to use the global protection domain list. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5141f5608c5c..a1bd99d390ab 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -499,43 +499,48 @@ static void iommu_flush_tlb_pde(struct protection_domain *domain) } /* - * This function flushes one domain on one IOMMU + * This function flushes all domains that have devices on the given IOMMU */ -static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) +static void flush_all_domains_on_iommu(struct amd_iommu *iommu) { - struct iommu_cmd cmd; + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + struct protection_domain *domain; unsigned long flags; - __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, - domid, 1, 1); - - spin_lock_irqsave(&iommu->lock, flags); - __iommu_queue_command(iommu, &cmd); - __iommu_completion_wait(iommu); - __iommu_wait_for_completion(iommu); - spin_unlock_irqrestore(&iommu->lock, flags); -} - -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) -{ - int i; + spin_lock_irqsave(&amd_iommu_pd_lock, flags); - for (i = 1; i < MAX_DOMAIN_ID; ++i) { - if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) + list_for_each_entry(domain, &amd_iommu_pd_list, list) { + if (domain->dev_iommu[iommu->index] == 0) continue; - flush_domain_on_iommu(iommu, i); + + spin_lock(&domain->lock); + iommu_queue_inv_iommu_pages(iommu, address, domain->id, 1, 1); + iommu_flush_complete(domain); + spin_unlock(&domain->lock); } + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. + */ void amd_iommu_flush_all_domains(void) { struct protection_domain *domain; + unsigned long flags; + + spin_lock_irqsave(&amd_iommu_pd_lock, flags); list_for_each_entry(domain, &amd_iommu_pd_list, list) { + spin_lock(&domain->lock); iommu_flush_tlb_pde(domain); iommu_flush_complete(domain); + spin_unlock(&domain->lock); } + + spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } static void flush_all_devices_for_iommu(struct amd_iommu *iommu) -- cgit v1.2.1 From 318afd41d2eca3224de3fd85a3b9a27a3010a98d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 18:32:38 +0100 Subject: x86/amd-iommu: Make np-cache a global flag The non-present cache flag was IOMMU local until now which doesn't make sense. Make this a global flag so we can remove the lase user of 'struct iommu' in the map/unmap path. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 +------- arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a1bd99d390ab..5ebd24e4fc57 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -131,12 +131,6 @@ static void amd_iommu_stats_init(void) #endif -/* returns !0 if the IOMMU is caching non-present entries in its TLB */ -static int iommu_has_npcache(struct amd_iommu *iommu) -{ - return iommu->cap & (1UL << IOMMU_CAP_NPCACHE); -} - /**************************************************************************** * * Interrupt handling functions @@ -1713,7 +1707,7 @@ retry: if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; - } else if (unlikely(iommu_has_npcache(iommu))) + } else if (unlikely(amd_iommu_np_cache)) iommu_flush_pages(&dma_dom->domain, address, size); out: diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 73d5173765d2..fbe4c3c02a91 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -141,6 +141,9 @@ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the struct amd_iommu *amd_iommus[MAX_IOMMUS]; int amd_iommus_present; +/* IOMMUs have a non-present cache? */ +bool amd_iommu_np_cache __read_mostly; + /* * List of protection domains - used during resume */ @@ -891,6 +894,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) + amd_iommu_np_cache = true; + return pci_enable_device(iommu->dev); } -- cgit v1.2.1 From 420aef8a3acfc3e75427107e23d5a9bafd17c477 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 16:14:57 +0100 Subject: x86/amd-iommu: Use check_device for amd_iommu_dma_supported The check_device logic needs to include the dma_supported checks to be really sure. Merge the dma_supported logic into check_device and use it to implement dma_supported. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 5ebd24e4fc57..ac27b1d6bd12 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1375,9 +1375,27 @@ static struct notifier_block device_nb = { */ static bool check_device(struct device *dev) { + u16 bdf; + struct pci_dev *pcidev; + if (!dev || !dev->dma_mask) return false; + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + pcidev = to_pci_dev(dev); + + bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + + /* Out of our scope? */ + if (bdf > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[bdf] == NULL) + return false; + return true; } @@ -2065,22 +2083,7 @@ free_mem: */ static int amd_iommu_dma_supported(struct device *dev, u64 mask) { - u16 bdf; - struct pci_dev *pcidev; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return 0; - - return 1; + return check_device(dev); } /* -- cgit v1.2.1 From f99c0f1c75f75924a6f19cb40a21ccefc6e8754d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 16:52:56 +0100 Subject: x86/amd-iommu: Use check_device in get_device_resources Every call-place of get_device_resources calls check_device before it. So call it from get_device_resources directly and simplify the code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 +++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ac27b1d6bd12..c5102ebdcbd9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1432,35 +1432,24 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) * If the device is not yet associated with a domain this is also done * in this function. */ -static int get_device_resources(struct device *dev, - struct amd_iommu **iommu, - struct protection_domain **domain, - u16 *bdf) +static bool get_device_resources(struct device *dev, + struct amd_iommu **iommu, + struct protection_domain **domain, + u16 *bdf) { struct dma_ops_domain *dma_dom; struct pci_dev *pcidev; u16 _bdf; - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; - - if (dev->bus != &pci_bus_type) - return 0; - - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) - return 0; - - *bdf = amd_iommu_alias_table[_bdf]; + if (!check_device(dev)) + return false; - *iommu = amd_iommu_rlookup_table[*bdf]; - if (*iommu == NULL) - return 0; + pcidev = to_pci_dev(dev); + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + *bdf = amd_iommu_alias_table[_bdf]; + *iommu = amd_iommu_rlookup_table[*bdf]; *domain = domain_for_device(*bdf); + if (*domain == NULL) { dma_dom = find_protection_domain(*bdf); if (!dma_dom) @@ -1474,7 +1463,7 @@ static int get_device_resources(struct device *dev, if (domain_for_device(_bdf) == NULL) attach_device(*iommu, *domain, _bdf); - return 1; + return true; } static void update_device_table(struct protection_domain *domain) @@ -1797,17 +1786,12 @@ static dma_addr_t map_page(struct device *dev, struct page *page, INC_STATS_COUNTER(cnt_map_single); - if (!check_device(dev)) - return DMA_ERROR_CODE; - - dma_mask = *dev->dma_mask; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (iommu == NULL || domain == NULL) + if (!get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; + dma_mask = *dev->dma_mask; + if (!dma_ops_domain(domain)) return DMA_ERROR_CODE; @@ -1838,8 +1822,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, INC_STATS_COUNTER(cnt_unmap_single); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; @@ -1893,16 +1876,11 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!check_device(dev)) - return 0; + if (!get_device_resources(dev, &iommu, &domain, &devid)) + return map_sg_no_iommu(dev, sglist, nelems, dir); dma_mask = *dev->dma_mask; - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) - return map_sg_no_iommu(dev, sglist, nelems, dir); - if (!dma_ops_domain(domain)) return 0; @@ -1958,8 +1936,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_unmap_sg); - if (!check_device(dev) || - !get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &iommu, &domain, &devid)) return; if (!dma_ops_domain(domain)) @@ -1994,24 +1971,22 @@ static void *alloc_coherent(struct device *dev, size_t size, INC_STATS_COUNTER(cnt_alloc_coherent); - if (!check_device(dev)) - return NULL; + if (!get_device_resources(dev, &iommu, &domain, &devid)) { + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + *dma_addr = __pa(virt_addr); + return virt_addr; + } - if (!get_device_resources(dev, &iommu, &domain, &devid)) - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + dma_mask = dev->coherent_dma_mask; + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; - flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return NULL; paddr = virt_to_phys(virt_addr); - if (!iommu || !domain) { - *dma_addr = (dma_addr_t)paddr; - return virt_addr; - } - if (!dma_ops_domain(domain)) goto out_free; @@ -2054,12 +2029,7 @@ static void free_coherent(struct device *dev, size_t size, INC_STATS_COUNTER(cnt_free_coherent); - if (!check_device(dev)) - return; - - get_device_resources(dev, &iommu, &domain, &devid); - - if (!iommu || !domain) + if (!get_device_resources(dev, &iommu, &domain, &devid)) goto free_mem; if (!dma_ops_domain(domain)) -- cgit v1.2.1 From 680525e06ddccda8c51bdddf532cd5b7d950c411 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 18:44:42 +0100 Subject: x86/amd-iommu: Remove iommu parameter from dma_ops_domain_(un)map The parameter is unused in these function so remove it from the parameter list. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c5102ebdcbd9..da3f9d8ee395 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1585,8 +1585,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, * This is the generic map function. It maps one 4kb page at paddr to * the given address in the DMA address space for the domain. */ -static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom, unsigned long address, phys_addr_t paddr, int direction) @@ -1620,8 +1619,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, /* * The generic unmapping function for on page in the DMA address space. */ -static void dma_ops_domain_unmap(struct amd_iommu *iommu, - struct dma_ops_domain *dom, +static void dma_ops_domain_unmap(struct dma_ops_domain *dom, unsigned long address) { struct aperture_range *aperture; @@ -1700,7 +1698,7 @@ retry: start = address; for (i = 0; i < pages; ++i) { - ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + ret = dma_ops_domain_map(dma_dom, start, paddr, dir); if (ret == DMA_ERROR_CODE) goto out_unmap; @@ -1724,7 +1722,7 @@ out_unmap: for (--i; i >= 0; --i) { start -= PAGE_SIZE; - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); } dma_ops_free_addresses(dma_dom, address, pages); @@ -1754,7 +1752,7 @@ static void __unmap_single(struct amd_iommu *iommu, start = dma_addr; for (i = 0; i < pages; ++i) { - dma_ops_domain_unmap(iommu, dma_dom, start); + dma_ops_domain_unmap(dma_dom, start); start += PAGE_SIZE; } -- cgit v1.2.1 From 576175c2503ae9b0f930ee9a6a0abaf7ef8956ad Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 19:08:46 +0100 Subject: x86/amd-iommu: Make alloc_new_range aware of multiple IOMMUs Since the assumption that an dma_ops domain is only bound to one IOMMU was given up we need to make alloc_new_range aware of it. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index da3f9d8ee395..687f617b95d7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -788,11 +788,11 @@ static u64 *fetch_pte(struct protection_domain *domain, * aperture in case of dma_ops domain allocation or address allocation * failure. */ -static int alloc_new_range(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static int alloc_new_range(struct dma_ops_domain *dma_dom, bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; + struct amd_iommu *iommu; int i; #ifdef CONFIG_IOMMU_STRESS @@ -832,14 +832,17 @@ static int alloc_new_range(struct amd_iommu *iommu, dma_dom->aperture_size += APERTURE_RANGE_SIZE; /* Intialize the exclusion range if necessary */ - if (iommu->exclusion_start && - iommu->exclusion_start >= dma_dom->aperture[index]->offset && - iommu->exclusion_start < dma_dom->aperture_size) { - unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, - iommu->exclusion_length, - PAGE_SIZE); - dma_ops_reserve_addresses(dma_dom, startpage, pages); + for_each_iommu(iommu) { + if (iommu->exclusion_start && + iommu->exclusion_start >= dma_dom->aperture[index]->offset + && iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage; + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); + startpage = iommu->exclusion_start >> PAGE_SHIFT; + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } } /* @@ -1143,7 +1146,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) add_domain_to_list(&dma_dom->domain); - if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) + if (alloc_new_range(dma_dom, true, GFP_KERNEL)) goto free_dma_dom; /* @@ -1686,7 +1689,7 @@ retry: */ dma_dom->next_address = dma_dom->aperture_size; - if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) + if (alloc_new_range(dma_dom, false, GFP_ATOMIC)) goto out; /* -- cgit v1.2.1 From cd8c82e875c27ee0d8b59fb76bc12aa9db6a70c2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 19:33:56 +0100 Subject: x86/amd-iommu: Remove iommu parameter from __(un)map_single With the prior changes this parameter is not longer required. This patch removes it from the function and all callers. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 687f617b95d7..c04dcb7f40b2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1653,7 +1653,6 @@ static void dma_ops_domain_unmap(struct dma_ops_domain *dom, * Must be called with the domain lock held. */ static dma_addr_t __map_single(struct device *dev, - struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, @@ -1737,8 +1736,7 @@ out_unmap: * Does the reverse of the __map_single function. Must be called with * the domain lock held too */ -static void __unmap_single(struct amd_iommu *iommu, - struct dma_ops_domain *dma_dom, +static void __unmap_single(struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, size_t size, int dir) @@ -1797,7 +1795,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, return DMA_ERROR_CODE; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); if (addr == DMA_ERROR_CODE) goto out; @@ -1832,7 +1830,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, dir); + __unmap_single(domain->priv, dma_addr, size, dir); iommu_flush_complete(domain); @@ -1890,7 +1888,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, s, nelems, i) { paddr = sg_phys(s); - s->dma_address = __map_single(dev, iommu, domain->priv, + s->dma_address = __map_single(dev, domain->priv, paddr, s->length, dir, false, dma_mask); @@ -1910,7 +1908,7 @@ out: unmap: for_each_sg(sglist, s, mapped_elems, i) { if (s->dma_address) - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1946,7 +1944,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { - __unmap_single(iommu, domain->priv, s->dma_address, + __unmap_single(domain->priv, s->dma_address, s->dma_length, dir); s->dma_address = s->dma_length = 0; } @@ -1996,7 +1994,7 @@ static void *alloc_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); - *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + *dma_addr = __map_single(dev, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); if (*dma_addr == DMA_ERROR_CODE) { @@ -2038,7 +2036,7 @@ static void free_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); - __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); iommu_flush_complete(domain); -- cgit v1.2.1 From f3be07da531ceef1b51295e5becc9bc07670b671 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 19:43:14 +0100 Subject: x86/amd-iommu: Remove iommu specific handling from dma_ops path This patch finishes the removal of all iommu specific handling code in the dma_ops path. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c04dcb7f40b2..2cd5800e6888 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1436,11 +1436,11 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) * in this function. */ static bool get_device_resources(struct device *dev, - struct amd_iommu **iommu, struct protection_domain **domain, u16 *bdf) { struct dma_ops_domain *dma_dom; + struct amd_iommu *iommu; struct pci_dev *pcidev; u16 _bdf; @@ -1450,21 +1450,21 @@ static bool get_device_resources(struct device *dev, pcidev = to_pci_dev(dev); _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); *bdf = amd_iommu_alias_table[_bdf]; - *iommu = amd_iommu_rlookup_table[*bdf]; + iommu = amd_iommu_rlookup_table[*bdf]; *domain = domain_for_device(*bdf); if (*domain == NULL) { dma_dom = find_protection_domain(*bdf); if (!dma_dom) - dma_dom = (*iommu)->default_dom; + dma_dom = iommu->default_dom; *domain = &dma_dom->domain; - attach_device(*iommu, *domain, *bdf); + attach_device(iommu, *domain, *bdf); DUMP_printk("Using protection domain %d for device %s\n", (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) - attach_device(*iommu, *domain, _bdf); + attach_device(iommu, *domain, _bdf); return true; } @@ -1776,7 +1776,6 @@ static dma_addr_t map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; dma_addr_t addr; @@ -1785,7 +1784,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, INC_STATS_COUNTER(cnt_map_single); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; @@ -1815,13 +1814,12 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; @@ -1864,7 +1862,6 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; int i; @@ -1875,7 +1872,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) return map_sg_no_iommu(dev, sglist, nelems, dir); dma_mask = *dev->dma_mask; @@ -1927,7 +1924,6 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, struct dma_attrs *attrs) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; struct scatterlist *s; u16 devid; @@ -1935,7 +1931,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_unmap_sg); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) return; if (!dma_ops_domain(domain)) @@ -1962,7 +1958,6 @@ static void *alloc_coherent(struct device *dev, size_t size, { unsigned long flags; void *virt_addr; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; phys_addr_t paddr; @@ -1970,7 +1965,7 @@ static void *alloc_coherent(struct device *dev, size_t size, INC_STATS_COUNTER(cnt_alloc_coherent); - if (!get_device_resources(dev, &iommu, &domain, &devid)) { + if (!get_device_resources(dev, &domain, &devid)) { virt_addr = (void *)__get_free_pages(flag, get_order(size)); *dma_addr = __pa(virt_addr); return virt_addr; @@ -2022,13 +2017,12 @@ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { unsigned long flags; - struct amd_iommu *iommu; struct protection_domain *domain; u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!get_device_resources(dev, &domain, &devid)) goto free_mem; if (!dma_ops_domain(domain)) -- cgit v1.2.1 From 15898bbcb48fc86c2baff156163df0941ecb6a15 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 15:39:42 +0100 Subject: x86/amd-iommu: Let domain_for_device handle aliases If there is no domain associated to a device yet and the device has an alias device which already has a domain, the original device needs to have the same domain as the alias device. This patch changes domain_for_device to handle this situation and directly assigns the alias device domain to the device in this situation. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 227 ++++++++++++++++++++++++++------------------ 1 file changed, 135 insertions(+), 92 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2cd5800e6888..75470ffee358 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -71,6 +71,19 @@ static u64 *fetch_pte(struct protection_domain *domain, unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); +/**************************************************************************** + * + * Helper functions + * + ****************************************************************************/ + +static inline u16 get_device_id(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + + return calc_devid(pdev->bus->number, pdev->devfn); +} + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1174,26 +1187,13 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -/* - * Find out the protection domain structure for a given PCI device. This - * will give us the pointer to the page table root for example. - */ -static struct protection_domain *domain_for_device(u16 devid) -{ - struct protection_domain *dom; - unsigned long flags; - - read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; - read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - - return dom; -} - static void set_dte_entry(u16 devid, struct protection_domain *domain) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; u64 pte_root = virt_to_phys(domain->pt_root); + BUG_ON(amd_iommu_pd_table[devid] != NULL); + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -1203,42 +1203,87 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); amd_iommu_pd_table[devid] = domain; + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the changes DTE entry */ + iommu_queue_inv_dev_entry(iommu, devid); +} + +static void clear_dte_entry(u16 devid) +{ + struct protection_domain *domain = amd_iommu_pd_table[devid]; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + BUG_ON(domain == NULL); + + /* remove domain from the lookup table */ + amd_iommu_pd_table[devid] = NULL; + + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; + + amd_iommu_apply_erratum_63(devid); + + /* decrease reference counters */ + domain->dev_iommu[iommu->index] -= 1; + domain->dev_cnt -= 1; + + iommu_queue_inv_dev_entry(iommu, devid); } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void __attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int __attach_device(struct device *dev, + struct protection_domain *domain) { + u16 devid = get_device_id(dev); + u16 alias = amd_iommu_alias_table[devid]; + /* lock domain */ spin_lock(&domain->lock); - /* update DTE entry */ - set_dte_entry(devid, domain); + /* Some sanity checks */ + if (amd_iommu_pd_table[alias] != NULL && + amd_iommu_pd_table[alias] != domain) + return -EBUSY; - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; + if (amd_iommu_pd_table[devid] != NULL && + amd_iommu_pd_table[devid] != domain) + return -EBUSY; + + /* Do real assignment */ + if (alias != devid && + amd_iommu_pd_table[alias] == NULL) + set_dte_entry(alias, domain); + + if (amd_iommu_pd_table[devid] == NULL) + set_dte_entry(devid, domain); /* ready */ spin_unlock(&domain->lock); + + return 0; } /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void attach_device(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static int attach_device(struct device *dev, + struct protection_domain *domain) { unsigned long flags; + int ret; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __attach_device(iommu, domain, devid); + ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); /* @@ -1246,62 +1291,70 @@ static void attach_device(struct amd_iommu *iommu, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_queue_inv_dev_entry(iommu, devid); iommu_flush_tlb_pde(domain); + + return ret; } /* * Removes a device from a protection domain (unlocked) */ -static void __detach_device(struct protection_domain *domain, u16 devid) +static void __detach_device(struct device *dev) { + u16 devid = get_device_id(dev); struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; BUG_ON(!iommu); - /* lock domain */ - spin_lock(&domain->lock); - - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; - - /* remove entry from the device table seen by the hardware */ - amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; - - amd_iommu_apply_erratum_63(devid); - - /* decrease reference counters */ - domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; - - /* ready */ - spin_unlock(&domain->lock); + clear_dte_entry(devid); /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - __attach_device(iommu, pt_domain, devid); - } + if (iommu_pass_through) + __attach_device(dev, pt_domain); } /* * Removes a device from a protection domain (with devtable_lock held) */ -static void detach_device(struct protection_domain *domain, u16 devid) +static void detach_device(struct device *dev) { unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); - __detach_device(domain, devid); + __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ +static struct protection_domain *domain_for_device(struct device *dev) +{ + struct protection_domain *dom; + unsigned long flags; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = amd_iommu_pd_table[devid]; + if (dom == NULL && + amd_iommu_pd_table[alias] != NULL) { + __attach_device(dev, amd_iommu_pd_table[alias]); + dom = amd_iommu_pd_table[devid]; + } + + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; +} + static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -1322,7 +1375,7 @@ static int device_change_notifier(struct notifier_block *nb, if (iommu == NULL) goto out; - domain = domain_for_device(devid); + domain = domain_for_device(dev); if (domain && !dma_ops_domain(domain)) WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " @@ -1334,7 +1387,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; if (iommu_pass_through) break; - detach_device(domain, devid); + detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: /* allocate a protection domain if a device is added */ @@ -1441,30 +1494,25 @@ static bool get_device_resources(struct device *dev, { struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - struct pci_dev *pcidev; - u16 _bdf; if (!check_device(dev)) return false; - pcidev = to_pci_dev(dev); - _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - *bdf = amd_iommu_alias_table[_bdf]; + *bdf = get_device_id(dev); + *domain = domain_for_device(dev); iommu = amd_iommu_rlookup_table[*bdf]; - *domain = domain_for_device(*bdf); - if (*domain == NULL) { - dma_dom = find_protection_domain(*bdf); - if (!dma_dom) - dma_dom = iommu->default_dom; - *domain = &dma_dom->domain; - attach_device(iommu, *domain, *bdf); - DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); - } + if (*domain != NULL) + return true; - if (domain_for_device(_bdf) == NULL) - attach_device(iommu, *domain, _bdf); + /* Device not bount yet - bind it */ + dma_dom = find_protection_domain(*bdf); + if (!dma_dom) + dma_dom = iommu->default_dom; + *domain = &dma_dom->domain; + attach_device(dev, *domain); + DUMP_printk("Using protection domain %d for device %s\n", + (*domain)->id, dev_name(dev)); return true; } @@ -2068,7 +2116,7 @@ static void prealloc_protection_domains(void) if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; - if (domain_for_device(devid)) + if (domain_for_device(&dev->dev)) continue; iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -2079,9 +2127,7 @@ static void prealloc_protection_domains(void) init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; - attach_device(iommu, &dma_dom->domain, devid); - if (__devid != devid) - attach_device(iommu, &dma_dom->domain, __devid); + attach_device(&dev->dev, &dma_dom->domain); list_add_tail(&dma_dom->list, &iommu_pd_list); } @@ -2174,7 +2220,7 @@ static void cleanup_domain(struct protection_domain *domain) for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) if (amd_iommu_pd_table[devid] == domain) - __detach_device(domain, devid); + clear_dte_entry(devid); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } @@ -2262,7 +2308,6 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { - struct protection_domain *domain = dom->priv; struct amd_iommu *iommu; struct pci_dev *pdev; u16 devid; @@ -2275,7 +2320,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, devid = calc_devid(pdev->bus->number, pdev->devfn); if (devid > 0) - detach_device(domain, devid); + detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -2292,6 +2337,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct protection_domain *old_domain; struct amd_iommu *iommu; struct pci_dev *pdev; + int ret; u16 devid; if (dev->bus != &pci_bus_type) @@ -2309,15 +2355,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, if (!iommu) return -EINVAL; - old_domain = domain_for_device(devid); + old_domain = amd_iommu_pd_table[devid]; if (old_domain) - detach_device(old_domain, devid); + detach_device(dev); - attach_device(iommu, domain, devid); + ret = attach_device(dev, domain); iommu_completion_wait(iommu); - return 0; + return ret; } static int amd_iommu_map_range(struct iommu_domain *dom, @@ -2414,8 +2460,9 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { + struct amd_iommu *iommu; struct pci_dev *dev = NULL; - u16 devid, devid2; + u16 devid; /* allocate passthroug domain */ pt_domain = protection_domain_alloc(); @@ -2425,20 +2472,16 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - struct amd_iommu *iommu; devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; - devid2 = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid2]; + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; - __attach_device(iommu, pt_domain, devid); - __attach_device(iommu, pt_domain, devid2); + attach_device(&dev->dev, pt_domain); } pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); -- cgit v1.2.1 From 94f6d190eeed91cb2bb901aa7816edd1e2405347 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 16:40:02 +0100 Subject: x86/amd-iommu: Simplify get_device_resources() With the previous changes the get_device_resources function can be simplified even more. The only important information for the callers is the protection domain. This patch renames the function to get_domain() and let it only return the protection domain for a device. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 86 +++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 50 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 75470ffee358..e5bbe9a0c192 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1463,6 +1463,7 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) { struct dma_ops_domain *entry, *ret = NULL; unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; if (list_empty(&iommu_pd_list)) return NULL; @@ -1470,7 +1471,8 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) spin_lock_irqsave(&iommu_pd_list_lock, flags); list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid) { + if (entry->target_dev == devid || + entry->target_dev == alias) { ret = entry; break; } @@ -1488,33 +1490,31 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) * If the device is not yet associated with a domain this is also done * in this function. */ -static bool get_device_resources(struct device *dev, - struct protection_domain **domain, - u16 *bdf) +static struct protection_domain *get_domain(struct device *dev) { + struct protection_domain *domain; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; + u16 devid = get_device_id(dev); if (!check_device(dev)) - return false; + return ERR_PTR(-EINVAL); - *bdf = get_device_id(dev); - *domain = domain_for_device(dev); - iommu = amd_iommu_rlookup_table[*bdf]; + domain = domain_for_device(dev); + if (domain != NULL && !dma_ops_domain(domain)) + return ERR_PTR(-EBUSY); - if (*domain != NULL) - return true; + if (domain != NULL) + return domain; /* Device not bount yet - bind it */ - dma_dom = find_protection_domain(*bdf); + dma_dom = find_protection_domain(devid); if (!dma_dom) - dma_dom = iommu->default_dom; - *domain = &dma_dom->domain; - attach_device(dev, *domain); + dma_dom = amd_iommu_rlookup_table[devid]->default_dom; + attach_device(dev, &dma_dom->domain); DUMP_printk("Using protection domain %d for device %s\n", - (*domain)->id, dev_name(dev)); + dma_dom->domain.id, dev_name(dev)); - return true; + return &dma_dom->domain; } static void update_device_table(struct protection_domain *domain) @@ -1825,23 +1825,22 @@ static dma_addr_t map_page(struct device *dev, struct page *page, { unsigned long flags; struct protection_domain *domain; - u16 devid; dma_addr_t addr; u64 dma_mask; phys_addr_t paddr = page_to_phys(page) + offset; INC_STATS_COUNTER(cnt_map_single); - if (!get_device_resources(dev, &domain, &devid)) - /* device not handled by any AMD IOMMU */ + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return (dma_addr_t)paddr; + else if (IS_ERR(domain)) + return DMA_ERROR_CODE; dma_mask = *dev->dma_mask; - if (!dma_ops_domain(domain)) - return DMA_ERROR_CODE; - spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, domain->priv, paddr, size, dir, false, dma_mask); if (addr == DMA_ERROR_CODE) @@ -1863,15 +1862,11 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, { unsigned long flags; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_unmap_single); - if (!get_device_resources(dev, &domain, &devid)) - /* device not handled by any AMD IOMMU */ - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); @@ -1911,7 +1906,6 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, { unsigned long flags; struct protection_domain *domain; - u16 devid; int i; struct scatterlist *s; phys_addr_t paddr; @@ -1920,14 +1914,14 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, INC_STATS_COUNTER(cnt_map_sg); - if (!get_device_resources(dev, &domain, &devid)) + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) return map_sg_no_iommu(dev, sglist, nelems, dir); + else if (IS_ERR(domain)) + return 0; dma_mask = *dev->dma_mask; - if (!dma_ops_domain(domain)) - return 0; - spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1974,15 +1968,12 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, unsigned long flags; struct protection_domain *domain; struct scatterlist *s; - u16 devid; int i; INC_STATS_COUNTER(cnt_unmap_sg); - if (!get_device_resources(dev, &domain, &devid)) - return; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) return; spin_lock_irqsave(&domain->lock, flags); @@ -2007,17 +1998,18 @@ static void *alloc_coherent(struct device *dev, size_t size, unsigned long flags; void *virt_addr; struct protection_domain *domain; - u16 devid; phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; INC_STATS_COUNTER(cnt_alloc_coherent); - if (!get_device_resources(dev, &domain, &devid)) { + domain = get_domain(dev); + if (PTR_ERR(domain) == -EINVAL) { virt_addr = (void *)__get_free_pages(flag, get_order(size)); *dma_addr = __pa(virt_addr); return virt_addr; - } + } else if (IS_ERR(domain)) + return NULL; dma_mask = dev->coherent_dma_mask; flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); @@ -2029,9 +2021,6 @@ static void *alloc_coherent(struct device *dev, size_t size, paddr = virt_to_phys(virt_addr); - if (!dma_ops_domain(domain)) - goto out_free; - if (!dma_mask) dma_mask = *dev->dma_mask; @@ -2066,14 +2055,11 @@ static void free_coherent(struct device *dev, size_t size, { unsigned long flags; struct protection_domain *domain; - u16 devid; INC_STATS_COUNTER(cnt_free_coherent); - if (!get_device_resources(dev, &domain, &devid)) - goto free_mem; - - if (!dma_ops_domain(domain)) + domain = get_domain(dev); + if (IS_ERR(domain)) goto free_mem; spin_lock_irqsave(&domain->lock, flags); -- cgit v1.2.1 From 71c70984e5afc20d304fbb523f1c8bb42c4ceb36 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 16:43:06 +0100 Subject: x86/amd-iommu: Move find_protection_domain to helper functions This is a helper function and when its placed in the helper function section we can remove its forward declaration. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 57 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e5bbe9a0c192..405f8dad7c77 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -59,7 +59,6 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); -static struct dma_ops_domain *find_protection_domain(u16 devid); static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, int end_lvl, u64 **pte_page, gfp_t gfp); @@ -84,6 +83,34 @@ static inline u16 get_device_id(struct device *dev) return calc_devid(pdev->bus->number, pdev->devfn); } +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + u16 alias = amd_iommu_alias_table[devid]; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid || + entry->target_dev == alias) { + ret = entry; + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1455,34 +1482,6 @@ static bool check_device(struct device *dev) return true; } -/* - * In this function the list of preallocated protection domains is traversed to - * find the domain for a specific device - */ -static struct dma_ops_domain *find_protection_domain(u16 devid) -{ - struct dma_ops_domain *entry, *ret = NULL; - unsigned long flags; - u16 alias = amd_iommu_alias_table[devid]; - - if (list_empty(&iommu_pd_list)) - return NULL; - - spin_lock_irqsave(&iommu_pd_list_lock, flags); - - list_for_each_entry(entry, &iommu_pd_list, list) { - if (entry->target_dev == devid || - entry->target_dev == alias) { - ret = entry; - break; - } - } - - spin_unlock_irqrestore(&iommu_pd_list_lock, flags); - - return ret; -} - /* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the -- cgit v1.2.1 From 98fc5a693bbdda498a556654c70d1e31a186c988 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:19:23 +0100 Subject: x86/amd-iommu: Use get_device_id and check_device where appropriate The logic of these two functions is reimplemented (at least in parts) in places in the code. This patch removes these code duplications and uses the functions instead. As a side effect it moves check_device() to the helper function code section. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 110 ++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 405f8dad7c77..d10195b685a7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -111,6 +111,33 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) return ret; } +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + u16 devid; + + if (!dev || !dev->dma_mask) + return false; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return false; + + devid = get_device_id(dev); + + /* Out of our scope? */ + if (devid > amd_iommu_last_bdf) + return false; + + if (amd_iommu_rlookup_table[devid] == NULL) + return false; + + return true; +} + #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1386,22 +1413,17 @@ static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; - struct pci_dev *pdev = to_pci_dev(dev); - u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + u16 devid; struct protection_domain *domain; struct dma_ops_domain *dma_domain; struct amd_iommu *iommu; unsigned long flags; - if (devid > amd_iommu_last_bdf) - goto out; - - devid = amd_iommu_alias_table[devid]; - - iommu = amd_iommu_rlookup_table[devid]; - if (iommu == NULL) - goto out; + if (!check_device(dev)) + return 0; + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; domain = domain_for_device(dev); if (domain && !dma_ops_domain(domain)) @@ -1452,36 +1474,6 @@ static struct notifier_block device_nb = { * *****************************************************************************/ -/* - * This function checks if the driver got a valid device from the caller to - * avoid dereferencing invalid pointers. - */ -static bool check_device(struct device *dev) -{ - u16 bdf; - struct pci_dev *pcidev; - - if (!dev || !dev->dma_mask) - return false; - - /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) - return false; - - pcidev = to_pci_dev(dev); - - bdf = calc_devid(pcidev->bus->number, pcidev->devfn); - - /* Out of our scope? */ - if (bdf > amd_iommu_last_bdf) - return false; - - if (amd_iommu_rlookup_table[bdf] == NULL) - return false; - - return true; -} - /* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the @@ -2094,15 +2086,20 @@ static void prealloc_protection_domains(void) struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - u16 devid, __devid; + u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - __devid = devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + + /* Do we handle this device? */ + if (!check_device(&dev->dev)) continue; - devid = amd_iommu_alias_table[devid]; + + /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; + + devid = get_device_id(&dev->dev); + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; @@ -2294,17 +2291,14 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); + devid = get_device_id(dev); - if (devid > 0) + if (amd_iommu_pd_table[devid] != NULL) detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; @@ -2321,20 +2315,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct protection_domain *domain = dom->priv; struct protection_domain *old_domain; struct amd_iommu *iommu; - struct pci_dev *pdev; int ret; u16 devid; - if (dev->bus != &pci_bus_type) + if (!check_device(dev)) return -EINVAL; - pdev = to_pci_dev(dev); - - devid = calc_devid(pdev->bus->number, pdev->devfn); - - if (devid >= amd_iommu_last_bdf || - devid != amd_iommu_alias_table[devid]) - return -EINVAL; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -2458,10 +2445,11 @@ int __init amd_iommu_init_passthrough(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); - if (devid > amd_iommu_last_bdf) + if (!check_device(&dev->dev)) continue; + devid = get_device_id(&dev->dev); + iommu = amd_iommu_rlookup_table[devid]; if (!iommu) continue; -- cgit v1.2.1 From 87a64d523825351a23743e69949c2a8c2077cecf Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:26:43 +0100 Subject: x86/amd-iommu: Remove iommu parameter from dma_ops_domain_alloc This function doesn't use the parameter anymore so it can be removed. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d10195b685a7..17e83ecb8b22 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1188,7 +1188,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) * It also intializes the page table and the address allocator data * structures required for the dma_ops interface */ -static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) +static struct dma_ops_domain *dma_ops_domain_alloc(void) { struct dma_ops_domain *dma_dom; @@ -1443,7 +1443,7 @@ static int device_change_notifier(struct notifier_block *nb, dma_domain = find_protection_domain(devid); if (dma_domain) goto out; - dma_domain = dma_ops_domain_alloc(iommu); + dma_domain = dma_ops_domain_alloc(); if (!dma_domain) goto out; dma_domain->target_dev = devid; @@ -2085,7 +2085,6 @@ static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; - struct amd_iommu *iommu; u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { @@ -2100,10 +2099,7 @@ static void prealloc_protection_domains(void) devid = get_device_id(&dev->dev); - iommu = amd_iommu_rlookup_table[devid]; - if (!iommu) - continue; - dma_dom = dma_ops_domain_alloc(iommu); + dma_dom = dma_ops_domain_alloc(); if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); @@ -2139,7 +2135,7 @@ int __init amd_iommu_init_dma_ops(void) * protection domain will be assigned to the default one. */ for_each_iommu(iommu) { - iommu->default_dom = dma_ops_domain_alloc(iommu); + iommu->default_dom = dma_ops_domain_alloc(); if (iommu->default_dom == NULL) return -ENOMEM; iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; -- cgit v1.2.1 From 308973d3b958b9328a1051642c81ee6dbc5021a4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:43:32 +0100 Subject: x86/amd-iommu: Move some pte allocation functions in the right section This patch moves alloc_pte() and fetch_pte() into the page table handling code section so that the forward declarations for them could be removed. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 193 +++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 99 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 17e83ecb8b22..90b365024c24 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -59,15 +59,10 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, - u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size); static void update_domain(struct protection_domain *domain); /**************************************************************************** @@ -664,6 +659,100 @@ void amd_iommu_flush_all_devices(void) * ****************************************************************************/ +/* + * This function is used to add another level to an IO page table. Adding + * another level increases the size of the address space by 9 bits to a size up + * to 64 bits. + */ +static bool increase_address_space(struct protection_domain *domain, + gfp_t gfp) +{ + u64 *pte; + + if (domain->mode == PAGE_MODE_6_LEVEL) + /* address space already 64 bit large */ + return false; + + pte = (void *)get_zeroed_page(gfp); + if (!pte) + return false; + + *pte = PM_LEVEL_PDE(domain->mode, + virt_to_phys(domain->pt_root)); + domain->pt_root = pte; + domain->mode += 1; + domain->updated = true; + + return true; +} + +static u64 *alloc_pte(struct protection_domain *domain, + unsigned long address, + int end_lvl, + u64 **pte_page, + gfp_t gfp) +{ + u64 *pte, *page; + int level; + + while (address > PM_LEVEL_SIZE(domain->mode)) + increase_address_space(domain, gfp); + + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > end_lvl) { + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(gfp); + if (!page) + return NULL; + *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); + } + + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + + if (pte_page && level == end_lvl) + *pte_page = pte; + + pte = &pte[PM_LEVEL_INDEX(level, address)]; + } + + return pte; +} + +/* + * This function checks if there is a PTE for a given dma address. If + * there is one, it returns the pointer to it. + */ +static u64 *fetch_pte(struct protection_domain *domain, + unsigned long address, int map_size) +{ + int level; + u64 *pte; + + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > map_size) { + if (!IOMMU_PTE_PRESENT(*pte)) + return NULL; + + level -= 1; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[PM_LEVEL_INDEX(level, address)]; + + if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { + pte = NULL; + break; + } + } + + return pte; +} + /* * Generic mapping functions. It maps a physical address into a DMA * address space. It allocates the page table pages if necessary. @@ -819,37 +908,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ -/* - * This function checks if there is a PTE for a given dma address. If - * there is one, it returns the pointer to it. - */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) -{ - int level; - u64 *pte; - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > map_size) { - if (!IOMMU_PTE_PRESENT(*pte)) - return NULL; - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - pte = &pte[PM_LEVEL_INDEX(level, address)]; - - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } - } - - return pte; -} - /* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation @@ -1534,69 +1592,6 @@ static void update_domain(struct protection_domain *domain) domain->updated = false; } -/* - * This function is used to add another level to an IO page table. Adding - * another level increases the size of the address space by 9 bits to a size up - * to 64 bits. - */ -static bool increase_address_space(struct protection_domain *domain, - gfp_t gfp) -{ - u64 *pte; - - if (domain->mode == PAGE_MODE_6_LEVEL) - /* address space already 64 bit large */ - return false; - - pte = (void *)get_zeroed_page(gfp); - if (!pte) - return false; - - *pte = PM_LEVEL_PDE(domain->mode, - virt_to_phys(domain->pt_root)); - domain->pt_root = pte; - domain->mode += 1; - domain->updated = true; - - return true; -} - -static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, - int end_lvl, - u64 **pte_page, - gfp_t gfp) -{ - u64 *pte, *page; - int level; - - while (address > PM_LEVEL_SIZE(domain->mode)) - increase_address_space(domain, gfp); - - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; - - while (level > end_lvl) { - if (!IOMMU_PTE_PRESENT(*pte)) { - page = (u64 *)get_zeroed_page(gfp); - if (!page) - return NULL; - *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); - } - - level -= 1; - - pte = IOMMU_PTE_PAGE(*pte); - - if (pte_page && level == end_lvl) - *pte_page = pte; - - pte = &pte[PM_LEVEL_INDEX(level, address)]; - } - - return pte; -} - /* * This function fetches the PTE for a given address in the aperture */ -- cgit v1.2.1 From 171e7b3739e175eea7b32eca9dbe189589e14a28 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 24 Nov 2009 17:47:56 +0100 Subject: x86/amd-iommu: Rearrange dma_ops related functions This patch rearranges two dma_ops related functions so that their forward declarations are not longer necessary. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 89 +++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 47 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 90b365024c24..14b60c0cdc70 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -57,11 +57,6 @@ struct iommu_cmd { u32 data[4]; }; -static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, - struct unity_map_entry *e); -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages); static void reset_iommu_command_buffer(struct amd_iommu *iommu); static void update_domain(struct protection_domain *domain); @@ -822,28 +817,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } -/* - * Init the unity mappings for a specific IOMMU in the system - * - * Basically iterates over all unity mapping entries and applies them to - * the default domain DMA of that IOMMU if necessary. - */ -static int iommu_init_unity_mappings(struct amd_iommu *iommu) -{ - struct unity_map_entry *entry; - int ret; - - list_for_each_entry(entry, &amd_iommu_unity_map, list) { - if (!iommu_for_unity_map(iommu, entry)) - continue; - ret = dma_ops_unity_map(iommu->default_dom, entry); - if (ret) - return ret; - } - - return 0; -} - /* * This function actually applies the mapping to the page table of the * dma_ops domain. @@ -872,6 +845,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ +static int iommu_init_unity_mappings(struct amd_iommu *iommu) +{ + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; +} + /* * Inits the unity mappings required for a specific device */ @@ -908,6 +903,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * called with domain->lock held */ +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ +static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) +{ + unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; + + if (start_page + pages > last_page) + pages = last_page - start_page; + + for (i = start_page; i < start_page + pages; ++i) { + int index = i / APERTURE_RANGE_PAGES; + int page = i % APERTURE_RANGE_PAGES; + __set_bit(page, dom->aperture[index]->bitmap); + } +} + /* * This function is used to add a new aperture range to an existing * aperture in case of dma_ops domain allocation or address allocation @@ -1166,26 +1181,6 @@ static void domain_id_free(int id) write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } -/* - * Used to reserve address ranges in the aperture (e.g. for exclusion - * ranges. - */ -static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, - unsigned long start_page, - unsigned int pages) -{ - unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT; - - if (start_page + pages > last_page) - pages = last_page - start_page; - - for (i = start_page; i < start_page + pages; ++i) { - int index = i / APERTURE_RANGE_PAGES; - int page = i % APERTURE_RANGE_PAGES; - __set_bit(page, dom->aperture[index]->bitmap); - } -} - static void free_pagetable(struct protection_domain *domain) { int i, j; -- cgit v1.2.1 From 8793abeb783c12cc37f92f6133fd6468152b98df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 27 Nov 2009 11:40:33 +0100 Subject: x86/amd-iommu: Remove support for domain sharing This patch makes device isolation mandatory and removes support for the amd_iommu=share option. This simplifies the code in several places. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 10 ++-------- arch/x86/kernel/amd_iommu_init.c | 17 ----------------- 2 files changed, 2 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 14b60c0cdc70..ed58a1688391 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -148,7 +148,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); static struct dentry *stats_dir; -static struct dentry *de_isolate; static struct dentry *de_fflush; static void amd_iommu_stats_add(struct __iommu_counter *cnt) @@ -166,9 +165,6 @@ static void amd_iommu_stats_init(void) if (stats_dir == NULL) return; - de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, - (u32 *)&amd_iommu_isolate); - de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, (u32 *)&amd_iommu_unmap_flush); @@ -2135,11 +2131,9 @@ int __init amd_iommu_init_dma_ops(void) } /* - * If device isolation is enabled, pre-allocate the protection - * domains for each device. + * Pre-allocate the protection domains for each device. */ - if (amd_iommu_isolate) - prealloc_protection_domains(); + prealloc_protection_domains(); iommu_detected = 1; swiotlb = 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fbe4c3c02a91..fe1686f6f91b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -125,13 +125,6 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ -#ifdef CONFIG_IOMMU_STRESS -bool amd_iommu_isolate = false; -#else -bool amd_iommu_isolate = true; /* if true, device isolation is - enabled */ -#endif - bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -1308,12 +1301,6 @@ static int __init amd_iommu_init(void) if (iommu_pass_through) goto out; - printk(KERN_INFO "AMD-Vi: device isolation "); - if (amd_iommu_isolate) - printk("enabled\n"); - else - printk("disabled\n"); - if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); else @@ -1387,10 +1374,6 @@ static int __init parse_amd_iommu_dump(char *str) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } -- cgit v1.2.1 From 657cbb6b6cba0f9c98c5299e0c803b2c0e67ea0a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 23 Nov 2009 15:26:46 +0100 Subject: x86/amd-iommu: Use dev->arch->iommu to store iommu related information This patch changes IOMMU code to use dev->archdata->iommu to store information about the alias device and the domain the device is attached to. This allows the driver to get rid of the amd_iommu_pd_table in the future. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 109 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 86 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index ed58a1688391..3214e8806f95 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -73,6 +73,11 @@ static inline u16 get_device_id(struct device *dev) return calc_devid(pdev->bus->number, pdev->devfn); } +static struct iommu_dev_data *get_dev_data(struct device *dev) +{ + return dev->archdata.iommu; +} + /* * In this function the list of preallocated protection domains is traversed to * find the domain for a specific device @@ -128,6 +133,35 @@ static bool check_device(struct device *dev) return true; } +static int iommu_init_device(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct pci_dev *pdev; + u16 devid, alias; + + if (dev->archdata.iommu) + return 0; + + dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); + if (pdev) + dev_data->alias = &pdev->dev; + + dev->archdata.iommu = dev_data; + + + return 0; +} + +static void iommu_uninit_device(struct device *dev) +{ + kfree(dev->archdata.iommu); +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -1346,28 +1380,39 @@ static void clear_dte_entry(u16 devid) static int __attach_device(struct device *dev, struct protection_domain *domain) { - u16 devid = get_device_id(dev); - u16 alias = amd_iommu_alias_table[devid]; + struct iommu_dev_data *dev_data, *alias_data; + u16 devid, alias; + + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return -EINVAL; /* lock domain */ spin_lock(&domain->lock); /* Some sanity checks */ - if (amd_iommu_pd_table[alias] != NULL && - amd_iommu_pd_table[alias] != domain) + if (alias_data->domain != NULL && + alias_data->domain != domain) return -EBUSY; - if (amd_iommu_pd_table[devid] != NULL && - amd_iommu_pd_table[devid] != domain) + if (dev_data->domain != NULL && + dev_data->domain != domain) return -EBUSY; /* Do real assignment */ if (alias != devid && - amd_iommu_pd_table[alias] == NULL) + alias_data->domain == NULL) { + alias_data->domain = domain; set_dte_entry(alias, domain); + } - if (amd_iommu_pd_table[devid] == NULL) + if (dev_data->domain == NULL) { + dev_data->domain = domain; set_dte_entry(devid, domain); + } /* ready */ spin_unlock(&domain->lock); @@ -1406,10 +1451,12 @@ static void __detach_device(struct device *dev) { u16 devid = get_device_id(dev); struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + struct iommu_dev_data *dev_data = get_dev_data(dev); BUG_ON(!iommu); clear_dte_entry(devid); + dev_data->domain = NULL; /* * If we run in passthrough mode the device must be assigned to the @@ -1439,18 +1486,23 @@ static void detach_device(struct device *dev) static struct protection_domain *domain_for_device(struct device *dev) { struct protection_domain *dom; + struct iommu_dev_data *dev_data, *alias_data; unsigned long flags; u16 devid, alias; - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; + devid = get_device_id(dev); + alias = amd_iommu_alias_table[devid]; + dev_data = get_dev_data(dev); + alias_data = get_dev_data(dev_data->alias); + if (!alias_data) + return NULL; read_lock_irqsave(&amd_iommu_devtable_lock, flags); - dom = amd_iommu_pd_table[devid]; + dom = dev_data->domain; if (dom == NULL && - amd_iommu_pd_table[alias] != NULL) { - __attach_device(dev, amd_iommu_pd_table[alias]); - dom = amd_iommu_pd_table[devid]; + alias_data->domain != NULL) { + __attach_device(dev, alias_data->domain); + dom = alias_data->domain; } read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); @@ -1473,14 +1525,12 @@ static int device_change_notifier(struct notifier_block *nb, devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - domain = domain_for_device(dev); - - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: + + domain = domain_for_device(dev); + if (!domain) goto out; if (iommu_pass_through) @@ -1488,6 +1538,11 @@ static int device_change_notifier(struct notifier_block *nb, detach_device(dev); break; case BUS_NOTIFY_ADD_DEVICE: + + iommu_init_device(dev); + + domain = domain_for_device(dev); + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) @@ -1502,6 +1557,10 @@ static int device_change_notifier(struct notifier_block *nb, spin_unlock_irqrestore(&iommu_pd_list_lock, flags); break; + case BUS_NOTIFY_DEL_DEVICE: + + iommu_uninit_device(dev); + default: goto out; } @@ -2079,6 +2138,8 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; + iommu_init_device(&dev->dev); + /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; @@ -2270,6 +2331,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) static void amd_iommu_detach_device(struct iommu_domain *dom, struct device *dev) { + struct iommu_dev_data *dev_data = dev->archdata.iommu; struct amd_iommu *iommu; u16 devid; @@ -2278,7 +2340,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, devid = get_device_id(dev); - if (amd_iommu_pd_table[devid] != NULL) + if (dev_data->domain != NULL) detach_device(dev); iommu = amd_iommu_rlookup_table[devid]; @@ -2293,7 +2355,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct protection_domain *domain = dom->priv; - struct protection_domain *old_domain; + struct iommu_dev_data *dev_data; struct amd_iommu *iommu; int ret; u16 devid; @@ -2301,14 +2363,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, if (!check_device(dev)) return -EINVAL; + dev_data = dev->archdata.iommu; + devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; if (!iommu) return -EINVAL; - old_domain = amd_iommu_pd_table[devid]; - if (old_domain) + if (dev_data->domain) detach_device(dev); ret = attach_device(dev, domain); -- cgit v1.2.1 From 241000556f751dacd332df6ab2e903a23746e51e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 25 Nov 2009 15:59:57 +0100 Subject: x86/amd-iommu: Add device bind reference counting This patch adds a reference count to each device to count how often the device was bound to that domain. This is important for single devices that act as an alias for a number of others. These devices must stay bound to their domains until all devices that alias to it are unbound from the same domain. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 3214e8806f95..f5db7d5e444e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -152,6 +152,8 @@ static int iommu_init_device(struct device *dev) if (pdev) dev_data->alias = &pdev->dev; + atomic_set(&dev_data->bind, 0); + dev->archdata.iommu = dev_data; @@ -1403,10 +1405,13 @@ static int __attach_device(struct device *dev, return -EBUSY; /* Do real assignment */ - if (alias != devid && - alias_data->domain == NULL) { - alias_data->domain = domain; - set_dte_entry(alias, domain); + if (alias != devid) { + if (alias_data->domain == NULL) { + alias_data->domain = domain; + set_dte_entry(alias, domain); + } + + atomic_inc(&alias_data->bind); } if (dev_data->domain == NULL) { @@ -1414,6 +1419,8 @@ static int __attach_device(struct device *dev, set_dte_entry(devid, domain); } + atomic_inc(&dev_data->bind); + /* ready */ spin_unlock(&domain->lock); @@ -1449,20 +1456,34 @@ static int attach_device(struct device *dev, */ static void __detach_device(struct device *dev) { - u16 devid = get_device_id(dev); + u16 devid = get_device_id(dev), alias; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; struct iommu_dev_data *dev_data = get_dev_data(dev); + struct iommu_dev_data *alias_data; BUG_ON(!iommu); - clear_dte_entry(devid); - dev_data->domain = NULL; + devid = get_device_id(dev); + alias = get_device_id(dev_data->alias); + + if (devid != alias) { + alias_data = get_dev_data(dev_data->alias); + if (atomic_dec_and_test(&alias_data->bind)) { + clear_dte_entry(alias); + alias_data->domain = NULL; + } + } + + if (atomic_dec_and_test(&dev_data->bind)) { + clear_dte_entry(devid); + dev_data->domain = NULL; + } /* * If we run in passthrough mode the device must be assigned to the * passthrough domain if it is detached from any other domain */ - if (iommu_pass_through) + if (iommu_pass_through && dev_data->domain == NULL) __attach_device(dev, pt_domain); } -- cgit v1.2.1 From 7c392cbe984d904f7c89a6a75b2ac245254e8da5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 11:13:32 +0100 Subject: x86/amd-iommu: Keep devices per domain in a list This patch introduces a list to each protection domain which keeps all devices associated with the domain. This can be used later to optimize certain functions and to completly remove the amd_iommu_pd_table. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f5db7d5e444e..530d6080940f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1286,6 +1286,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void) dma_dom->domain.id = domain_id_alloc(); if (dma_dom->domain.id == 0) goto free_dma_dom; + INIT_LIST_HEAD(&dma_dom->domain.dev_list); dma_dom->domain.mode = PAGE_MODE_2_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); dma_dom->domain.flags = PD_DMA_OPS_MASK; @@ -1408,6 +1409,7 @@ static int __attach_device(struct device *dev, if (alias != devid) { if (alias_data->domain == NULL) { alias_data->domain = domain; + list_add(&alias_data->list, &domain->dev_list); set_dte_entry(alias, domain); } @@ -1416,6 +1418,7 @@ static int __attach_device(struct device *dev, if (dev_data->domain == NULL) { dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); set_dte_entry(devid, domain); } @@ -1460,6 +1463,7 @@ static void __detach_device(struct device *dev) struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; + unsigned long flags; BUG_ON(!iommu); @@ -1469,13 +1473,19 @@ static void __detach_device(struct device *dev) if (devid != alias) { alias_data = get_dev_data(dev_data->alias); if (atomic_dec_and_test(&alias_data->bind)) { + spin_lock_irqsave(&alias_data->domain->lock, flags); clear_dte_entry(alias); + list_del(&alias_data->list); + spin_unlock_irqrestore(&alias_data->domain->lock, flags); alias_data->domain = NULL; } } if (atomic_dec_and_test(&dev_data->bind)) { + spin_lock_irqsave(&dev_data->domain->lock, flags); clear_dte_entry(devid); + list_del(&dev_data->list); + spin_unlock_irqrestore(&dev_data->domain->lock, flags); dev_data->domain = NULL; } @@ -2294,6 +2304,7 @@ static struct protection_domain *protection_domain_alloc(void) domain->id = domain_id_alloc(); if (!domain->id) goto out_err; + INIT_LIST_HEAD(&domain->dev_list); add_domain_to_list(domain); -- cgit v1.2.1 From 7f760ddd702d162d693bc79f62c3bdd7fe55bd9d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 14:49:59 +0100 Subject: x86/amd-iommu: Cleanup attach/detach_device code This patch cleans up the attach_device and detach_device paths and fixes reference counting while at it. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 102 +++++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 530d6080940f..e3363fd5eef5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1329,7 +1329,6 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; u64 pte_root = virt_to_phys(domain->pt_root); BUG_ON(amd_iommu_pd_table[devid] != NULL); @@ -1344,18 +1343,11 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_pd_table[devid] = domain; - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; - - /* Flush the changes DTE entry */ - iommu_queue_inv_dev_entry(iommu, devid); } static void clear_dte_entry(u16 devid) { struct protection_domain *domain = amd_iommu_pd_table[devid]; - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; BUG_ON(domain == NULL); @@ -1368,11 +1360,51 @@ static void clear_dte_entry(u16 devid) amd_iommu_dev_table[devid].data[2] = 0; amd_iommu_apply_erratum_63(devid); +} + +static void do_attach(struct device *dev, struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + set_dte_entry(devid, domain); + + /* Do reference counting */ + domain->dev_iommu[iommu->index] += 1; + domain->dev_cnt += 1; + + /* Flush the DTE entry */ + iommu_queue_inv_dev_entry(iommu, devid); +} + +static void do_detach(struct device *dev) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); /* decrease reference counters */ - domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; + dev_data->domain->dev_iommu[iommu->index] -= 1; + dev_data->domain->dev_cnt -= 1; + + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + clear_dte_entry(devid); + /* Flush the DTE entry */ iommu_queue_inv_dev_entry(iommu, devid); } @@ -1384,12 +1416,10 @@ static int __attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data, *alias_data; - u16 devid, alias; - devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); + if (!alias_data) return -EINVAL; @@ -1406,21 +1436,16 @@ static int __attach_device(struct device *dev, return -EBUSY; /* Do real assignment */ - if (alias != devid) { - if (alias_data->domain == NULL) { - alias_data->domain = domain; - list_add(&alias_data->list, &domain->dev_list); - set_dte_entry(alias, domain); - } + if (dev_data->alias != dev) { + alias_data = get_dev_data(dev_data->alias); + if (alias_data->domain == NULL) + do_attach(dev_data->alias, domain); atomic_inc(&alias_data->bind); } - if (dev_data->domain == NULL) { - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain); - } + if (dev_data->domain == NULL) + do_attach(dev, domain); atomic_inc(&dev_data->bind); @@ -1459,35 +1484,24 @@ static int attach_device(struct device *dev, */ static void __detach_device(struct device *dev) { - u16 devid = get_device_id(dev), alias; - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; unsigned long flags; - BUG_ON(!iommu); + BUG_ON(!dev_data->domain); - devid = get_device_id(dev); - alias = get_device_id(dev_data->alias); + spin_lock_irqsave(&dev_data->domain->lock, flags); - if (devid != alias) { + if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); - if (atomic_dec_and_test(&alias_data->bind)) { - spin_lock_irqsave(&alias_data->domain->lock, flags); - clear_dte_entry(alias); - list_del(&alias_data->list); - spin_unlock_irqrestore(&alias_data->domain->lock, flags); - alias_data->domain = NULL; - } + if (atomic_dec_and_test(&alias_data->bind)) + do_detach(dev_data->alias); } - if (atomic_dec_and_test(&dev_data->bind)) { - spin_lock_irqsave(&dev_data->domain->lock, flags); - clear_dte_entry(devid); - list_del(&dev_data->list); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); - dev_data->domain = NULL; - } + if (atomic_dec_and_test(&dev_data->bind)) + do_detach(dev); + + spin_unlock_irqrestore(&dev_data->domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the -- cgit v1.2.1 From 3fa43655d81d471d47c44b0db4e2be1f8af32207 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 15:04:38 +0100 Subject: x86/amd-iommu: Introduce iommu_flush_device() function This patch adds a function to flush a DTE entry for a given struct device and replaces iommu_queue_inv_dev_entry calls with this function where appropriate. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e3363fd5eef5..41c4ebecced4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -494,6 +494,17 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return ret; } +static int iommu_flush_device(struct device *dev) +{ + struct amd_iommu *iommu; + u16 devid; + + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + + return iommu_queue_inv_dev_entry(iommu, devid); +} + static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, u16 domid, int pde, int s) { @@ -1382,7 +1393,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) domain->dev_cnt += 1; /* Flush the DTE entry */ - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); } static void do_detach(struct device *dev) @@ -1405,7 +1416,7 @@ static void do_detach(struct device *dev) clear_dte_entry(devid); /* Flush the DTE entry */ - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); } /* @@ -1610,7 +1621,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); out: @@ -2393,7 +2404,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - iommu_queue_inv_dev_entry(iommu, devid); + iommu_flush_device(dev); iommu_completion_wait(iommu); } -- cgit v1.2.1 From b00d3bcff4d996f65e337d404b0df5dc201a01ab Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 15:35:33 +0100 Subject: x86/amd-iommu: Cleanup DTE flushing code This patch cleans up the code to flush device table entries in the IOMMU. With this chance the driver can get rid of the iommu_queue_inv_dev_entry() function. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 100 +++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 66 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 41c4ebecced4..0eafca58926f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -146,6 +146,8 @@ static int iommu_init_device(struct device *dev) if (!dev_data) return -ENOMEM; + dev_data->dev = dev; + devid = get_device_id(dev); alias = amd_iommu_alias_table[devid]; pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); @@ -478,31 +480,21 @@ static void iommu_flush_complete(struct protection_domain *domain) /* * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) -{ - struct iommu_cmd cmd; - int ret; - - BUG_ON(iommu == NULL); - - memset(&cmd, 0, sizeof(cmd)); - CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); - cmd.data[0] = devid; - - ret = iommu_queue_command(iommu, &cmd); - - return ret; -} - static int iommu_flush_device(struct device *dev) { struct amd_iommu *iommu; + struct iommu_cmd cmd; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - return iommu_queue_inv_dev_entry(iommu, devid); + /* Build command */ + memset(&cmd, 0, sizeof(cmd)); + CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); + cmd.data[0] = devid; + + return iommu_queue_command(iommu, &cmd); } static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, @@ -592,30 +584,43 @@ static void iommu_flush_tlb_pde(struct protection_domain *domain) __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } + /* - * This function flushes all domains that have devices on the given IOMMU + * This function flushes the DTEs for all devices in domain */ -static void flush_all_domains_on_iommu(struct amd_iommu *iommu) +static void iommu_flush_domain_devices(struct protection_domain *domain) +{ + struct iommu_dev_data *dev_data; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + + list_for_each_entry(dev_data, &domain->dev_list, list) + iommu_flush_device(dev_data->dev); + + spin_unlock_irqrestore(&domain->lock, flags); +} + +static void iommu_flush_all_domain_devices(void) { - u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; struct protection_domain *domain; unsigned long flags; spin_lock_irqsave(&amd_iommu_pd_lock, flags); list_for_each_entry(domain, &amd_iommu_pd_list, list) { - if (domain->dev_iommu[iommu->index] == 0) - continue; - - spin_lock(&domain->lock); - iommu_queue_inv_iommu_pages(iommu, address, domain->id, 1, 1); + iommu_flush_domain_devices(domain); iommu_flush_complete(domain); - spin_unlock(&domain->lock); } spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } +void amd_iommu_flush_all_devices(void) +{ + iommu_flush_all_domain_devices(); +} + /* * This function uses heavy locking and may disable irqs for some time. But * this is no issue because it is only called during resume. @@ -637,38 +642,6 @@ void amd_iommu_flush_all_domains(void) spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -static void flush_all_devices_for_iommu(struct amd_iommu *iommu) -{ - int i; - - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (iommu != amd_iommu_rlookup_table[i]) - continue; - - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } -} - -static void flush_devices_by_domain(struct protection_domain *domain) -{ - struct amd_iommu *iommu; - int i; - - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) - continue; - - iommu = amd_iommu_rlookup_table[i]; - if (!iommu) - continue; - - iommu_queue_inv_dev_entry(iommu, i); - iommu_completion_wait(iommu); - } -} - static void reset_iommu_command_buffer(struct amd_iommu *iommu) { pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); @@ -679,17 +652,12 @@ static void reset_iommu_command_buffer(struct amd_iommu *iommu) iommu->reset_in_progress = true; amd_iommu_reset_cmd_buffer(iommu); - flush_all_devices_for_iommu(iommu); - flush_all_domains_on_iommu(iommu); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); iommu->reset_in_progress = false; } -void amd_iommu_flush_all_devices(void) -{ - flush_devices_by_domain(NULL); -} - /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -1692,7 +1660,7 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - flush_devices_by_domain(domain); + iommu_flush_domain_devices(domain); iommu_flush_tlb_pde(domain); domain->updated = false; -- cgit v1.2.1 From 8eed9833346781dd15e3bef35a91b0a40787ea3c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 26 Nov 2009 15:45:41 +0100 Subject: x86/amd-iommu: Move reset_iommu_command_buffer out of locked code This patch removes the ugly contruct where the iommu->lock must be released while before calling the reset_iommu_command_buffer function. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0eafca58926f..b75fcd9b6a0f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -285,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); dump_command(address); break; @@ -407,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu) status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - if (unlikely(i == EXIT_LOOP_COUNT)) { - spin_unlock(&iommu->lock); - reset_iommu_command_buffer(iommu); - spin_lock(&iommu->lock); - } + if (unlikely(i == EXIT_LOOP_COUNT)) + iommu->reset_in_progress = true; } /* @@ -458,6 +456,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) out: spin_unlock_irqrestore(&iommu->lock, flags); + if (iommu->reset_in_progress) + reset_iommu_command_buffer(iommu); + return 0; } @@ -649,8 +650,6 @@ static void reset_iommu_command_buffer(struct amd_iommu *iommu) if (iommu->reset_in_progress) panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); - iommu->reset_in_progress = true; - amd_iommu_reset_cmd_buffer(iommu); amd_iommu_flush_all_devices(); amd_iommu_flush_all_domains(); -- cgit v1.2.1 From 492667dacc0ac9763969155482b1261b34ccf450 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 27 Nov 2009 13:25:47 +0100 Subject: x86/amd-iommu: Remove amd_iommu_pd_table The data that was stored in this table is now available in dev->archdata.iommu. So this table is not longer necessary. This patch removes the remaining uses of that variable and removes it from the code. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 35 +++++++++++------------------------ arch/x86/kernel/amd_iommu_init.c | 18 ------------------ 2 files changed, 11 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b75fcd9b6a0f..32fb09102a13 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1309,8 +1309,6 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) { u64 pte_root = virt_to_phys(domain->pt_root); - BUG_ON(amd_iommu_pd_table[devid] != NULL); - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -1318,20 +1316,10 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain) amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); - - amd_iommu_pd_table[devid] = domain; - } static void clear_dte_entry(u16 devid) { - struct protection_domain *domain = amd_iommu_pd_table[devid]; - - BUG_ON(domain == NULL); - - /* remove domain from the lookup table */ - amd_iommu_pd_table[devid] = NULL; - /* remove entry from the device table seen by the hardware */ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; amd_iommu_dev_table[devid].data[1] = 0; @@ -1641,15 +1629,11 @@ static struct protection_domain *get_domain(struct device *dev) static void update_device_table(struct protection_domain *domain) { - unsigned long flags; - int i; + struct iommu_dev_data *dev_data; - for (i = 0; i <= amd_iommu_last_bdf; ++i) { - if (amd_iommu_pd_table[i] != domain) - continue; - write_lock_irqsave(&amd_iommu_devtable_lock, flags); - set_dte_entry(i, domain); - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + list_for_each_entry(dev_data, &domain->dev_list, list) { + u16 devid = get_device_id(dev_data->dev); + set_dte_entry(devid, domain); } } @@ -2259,14 +2243,17 @@ free_domains: static void cleanup_domain(struct protection_domain *domain) { + struct iommu_dev_data *dev_data, *next; unsigned long flags; - u16 devid; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) - if (amd_iommu_pd_table[devid] == domain) - clear_dte_entry(devid); + list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { + struct device *dev = dev_data->dev; + + do_detach(dev); + atomic_set(&dev_data->bind, 0); + } write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fe1686f6f91b..7ffc39965233 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -164,12 +164,6 @@ u16 *amd_iommu_alias_table; */ struct amd_iommu **amd_iommu_rlookup_table; -/* - * The pd table (protection domain table) is used to find the protection domain - * data structure a device belongs to. Indexed with the PCI device id too. - */ -struct protection_domain **amd_iommu_pd_table; - /* * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap * to know which ones are already in use. @@ -1238,15 +1232,6 @@ static int __init amd_iommu_init(void) if (amd_iommu_rlookup_table == NULL) goto free; - /* - * Protection Domain table - maps devices to protection domains - * This table has the same size as the rlookup_table - */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(rlookup_table_size)); - if (amd_iommu_pd_table == NULL) - goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); @@ -1314,9 +1299,6 @@ free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); - free_pages((unsigned long)amd_iommu_rlookup_table, get_order(rlookup_table_size)); -- cgit v1.2.1 From 18ed61da985c57eea3fe8038b13fa2837c9b3c3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 27 Nov 2009 15:24:44 +0100 Subject: x86: hpet: Make WARN_ON understandable Andrew complained rightly that the WARN_ON in hpet_next_event() is confusing and the code comment not really helpful. Change it to WARN_ONCE and print the reason in clear text. Change the comment to explain what kind of hardware wreckage we deal with. Pointed-out-by: Andrew Morton Signed-off-by: Thomas Gleixner Cc: Venki Pallipadi --- arch/x86/kernel/hpet.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7f024ff47d1d..ba6e65884603 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -384,11 +384,22 @@ static int hpet_next_event(unsigned long delta, hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register to make sure that - * what we wrote hit the chip before we compare it to the - * counter. + * We need to read back the CMP register on certain HPET + * implementations (ATI chipsets) which seem to delay the + * transfer of the compare register into the internal compare + * logic. With small deltas this might actually be too late as + * the counter could already be higher than the compare value + * at that point and we would wait for the next hpet interrupt + * forever. We found out that reading the CMP register back + * forces the transfer so we can rely on the comparison with + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. */ - WARN_ON_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt); + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + KERN_WARNING "hpet: compare register read back failed.\n"); return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } -- cgit v1.2.1 From 9eaa192d8988d621217a9e6071cd403fd6010496 Mon Sep 17 00:00:00 2001 From: "Helight.Xu" Date: Mon, 30 Nov 2009 18:33:51 +0800 Subject: x86: Fix a section mismatch in arch/x86/kernel/setup.c copy_edd() should be __init. warning msg: WARNING: vmlinux.o(.text+0x7759): Section mismatch in reference from the function copy_edd() to the variable .init.data:boot_params The function copy_edd() references the variable __initdata boot_params. This is often because copy_edd lacks a __initdata annotation or the annotation of boot_params is wrong. Signed-off-by: ZhenwenXu LKML-Reference: <4B139F8F.4000907@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..e020b2d03b4f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -247,7 +247,7 @@ EXPORT_SYMBOL(edd); * from boot_params into a safe place. * */ -static inline void copy_edd(void) +static inline void __init copy_edd(void) { memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, sizeof(edd.mbr_signature)); @@ -256,7 +256,7 @@ static inline void copy_edd(void) edd.edd_info_nr = boot_params.eddbuf_entries; } #else -static inline void copy_edd(void) +static inline void __init copy_edd(void) { } #endif -- cgit v1.2.1 From 1cedae72904b85462082dbcfd5190309ba37f8bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2009 07:32:16 +0100 Subject: hw-breakpoints: Keep track of user disabled breakpoints When we disable a breakpoint through dr7, we unregister it right away, making us lose track of its corresponding address register value. It means that the following sequence would be unsupported: - set address in dr0 - enable it through dr7 - disable it through dr7 - enable it through dr7 because we lost the address register value when we disabled the breakpoint. Don't unregister the disabled breakpoints but rather disable them. Reported-by: "K.Prasad" Signed-off-by: Frederic Weisbecker LKML-Reference: <1259735536-9236-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2941b32ea666..04d182a7cfdb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -595,7 +595,7 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) static struct perf_event * ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, - struct task_struct *tsk) + struct task_struct *tsk, int disabled) { int err; int gen_len, gen_type; @@ -616,7 +616,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, attr = bp->attr; attr.bp_len = gen_len; attr.bp_type = gen_type; - attr.disabled = 0; + attr.disabled = disabled; return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); } @@ -655,13 +655,21 @@ restore: */ if (!second_pass) continue; + thread->ptrace_bps[i] = NULL; - unregister_hw_breakpoint(bp); + bp = ptrace_modify_breakpoint(bp, len, type, + tsk, 1); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + thread->ptrace_bps[i] = NULL; + break; + } + thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk); + bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); /* Incorrect bp, or we have a bug in bp API */ if (IS_ERR(bp)) { -- cgit v1.2.1 From ca64c47cecd0321b2e0dcbd7aaff44b68ce20654 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Tue, 1 Dec 2009 15:31:15 -0800 Subject: x86, io-apic: Move the effort of clearing remoteIRR explicitly before migrating the irq When the level-triggered interrupt is seen as an edge interrupt, we try to clear the remoteIRR explicitly (using either an io-apic eoi register when present or through the idea of changing trigger mode of the io-apic RTE to edge and then back to level). But this explicit try also needs to happen before we try to migrate the irq. Otherwise irq migration attempt will fail anyhow, as it postpones the irq migration to a later attempt when it sees the remoteIRR in the io-apic RTE still set. Signed-off-by: "Maciej W. Rozycki" Reviewed-by: Suresh Siddha Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233334.975416130@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 085e60e303cf..b377b973899e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2583,6 +2583,20 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); + /* Tail end of version 0x11 I/O APIC bug workaround */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + if (use_eoi_reg) + eoi_ioapic_irq(desc); + else { + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(cfg); + __unmask_and_level_IO_APIC_irq(cfg); + spin_unlock(&ioapic_lock); + } + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2616,20 +2630,6 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - - if (use_eoi_reg) - eoi_ioapic_irq(desc); - else { - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } - } } #ifdef CONFIG_INTR_REMAP -- cgit v1.2.1 From c29d9db338db606c3335a03f337e1d4b7f6bb727 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 1 Dec 2009 15:31:16 -0800 Subject: x86, ioapic: Fix the EOI register detection mechanism Maciej W. Rozycki reported: > 82093AA I/O APIC has its version set to 0x11 and it > does not support the EOI register. Similarly I/O APICs > integrated into the 82379AB south bridge and the 82374EB/SB > EISA component. IO-APIC versions below 0x20 don't support EOI register. Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic version as 0x2. This is an error with documentation and these ICH chips use io-apic's of version 0x20 and indeed has a working EOI register for the io-apic. Fix the EOI register detection mechanism to check for version 0x20 and beyond. And also, a platform can potentially have io-apic's with different versions. Make the EOI register check per io-apic. Reported-by: Maciej W. Rozycki Signed-off-by: Suresh Siddha Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233335.065361533@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 115 ++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 54 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b377b973899e..78960a3b0ed0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -539,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} + +static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry) +{ + __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -579,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -2492,17 +2498,42 @@ static void ack_apic_edge(unsigned int irq) atomic_t irq_mis_count; -static int use_eoi_reg __read_mostly; - +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. +*/ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { struct irq_pin_list *entry; for_each_irq_pin(entry, cfg->irq_2_pin) { - if (irq_remapped(irq)) - io_apic_eoi(entry->apic, entry->pin); - else - io_apic_eoi(entry->apic, cfg->vector); + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + __mask_and_edge_IO_APIC_irq(entry); + __unmask_and_level_IO_APIC_irq(entry); + } } } @@ -2520,23 +2551,6 @@ static void eoi_ioapic_irq(struct irq_desc *desc) spin_unlock_irqrestore(&ioapic_lock, flags); } -static int ioapic_supports_eoi(void) -{ - struct pci_dev *root; - - root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0)); - if (root && root->vendor == PCI_VENDOR_ID_INTEL && - mp_ioapics[0].apicver >= 0x2) { - use_eoi_reg = 1; - printk(KERN_INFO "IO-APIC supports EOI register\n"); - } else - printk(KERN_INFO "IO-APIC doesn't support EOI\n"); - - return 0; -} - -fs_initcall(ioapic_supports_eoi); - static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2587,14 +2601,7 @@ static void ack_apic_level(unsigned int irq) if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); - if (use_eoi_reg) - eoi_ioapic_irq(desc); - else { - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } + eoi_ioapic_irq(desc); } /* Now we can move and renable the irq */ -- cgit v1.2.1 From 1c83995b6c7c6bb795bce80f75fbffb15f78db2d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 1 Dec 2009 15:31:17 -0800 Subject: x86, ioapic: Document another case when level irq is seen as an edge In the case when cpu goes offline, fixup_irqs() will forward any unhandled interrupt on the offlined cpu to the new cpu destination that is handling the corresponding interrupt. This interrupt forwarding is done via IPI's. Hence, in this case also level-triggered io-apic interrupt will be seen as an edge interrupt in the cpu's APIC IRR. Document this scenario in the code which handles this case by doing an explicit EOI to the io-apic to clear remote IRR of the io-apic RTE. Requested-by: Maciej W. Rozycki Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233335.143970505@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 78960a3b0ed0..c0b4468683f9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2586,6 +2586,19 @@ static void ack_apic_level(unsigned int irq) * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2597,7 +2610,13 @@ static void ack_apic_level(unsigned int irq) */ ack_APIC_irq(); - /* Tail end of version 0x11 I/O APIC bug workaround */ + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); -- cgit v1.2.1 From 6d20792e85187b27ae3d1b76678a2dd7025e8bc2 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 1 Dec 2009 15:31:18 -0800 Subject: x86: Remove unnecessary mdelay() from cpu_disable_common() fixup_irqs() already has a mdelay(). Remove the extra and unnecessary mdelay() from cpu_disable_common(). Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Cc: ebiederm@xmission.com Cc: garyhade@us.ibm.com LKML-Reference: <20091201233335.232177348@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 565ebc65920e..324f2a44c221 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1250,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu) void cpu_disable_common(void) { int cpu = smp_processor_id(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ -- cgit v1.2.1 From fe5ed91ddce85a0ed0e4f92c10b099873ef62167 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 3 Dec 2009 11:33:08 +0900 Subject: x86, mce: don't restart timer if disabled Even it is in error path unlikely taken, add_timer_on() at CPU_DOWN_FAILED* needs to be skipped if mce_timer is disabled. Signed-off-by: Hidetoshi Seto Cc: Andi Kleen Cc: Huang Ying Cc: Jan Beulich Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 721a77ca8115..4825a3d6eb40 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1991,9 +1991,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + + if (!mce_ignore_ce && check_interval) { + t->expires = round_jiffies(jiffies + __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); + add_timer_on(t, cpu); + } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; case CPU_POST_DEAD: -- cgit v1.2.1 From 4528752f49c1f4025473d12bc5fa9181085c3f22 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Dec 2009 15:05:56 -0800 Subject: x86, Calgary IOMMU quirk: Find nearest matching Calgary while walking up the PCI tree On a multi-node x3950M2 system, there's a slight oddity in the PCI device tree for all secondary nodes: 30:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev e1) \-33:00.0 PCI bridge: IBM CalIOC2 PCI-E Root Port (rev 01) \-34:00.0 RAID bus controller: LSI Logic / Symbios Logic MegaRAID SAS 1078 (rev 04) ...as compared to the primary node: 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev e1) \-01:00.0 VGA compatible controller: ATI Technologies Inc ES1000 (rev 02) 03:00.0 PCI bridge: IBM CalIOC2 PCI-E Root Port (rev 01) \-04:00.0 RAID bus controller: LSI Logic / Symbios Logic MegaRAID SAS 1078 (rev 04) In both nodes, the LSI RAID controller hangs off a CalIOC2 device, but on the secondary nodes, the BIOS hides the VGA device and substitutes the device tree ending with the disk controller. It would seem that Calgary devices don't necessarily appear at the top of the PCI tree, which means that the current code to find the Calgary IOMMU that goes with a particular device is buggy. Rather than walk all the way to the top of the PCI device tree and try to match bus number with Calgary descriptor, the code needs to examine each parent of the particular device; if it encounters a Calgary with a matching bus number, simply use that. Otherwise, we BUG() when the bus number of the Calgary doesn't match the bus number of whatever's at the top of the device tree. Extra note: This patch appears to work correctly for the x3950 that came before the x3950 M2. Signed-off-by: Darrick J. Wong Acked-by: Muli Ben-Yehuda Cc: FUJITA Tomonori Cc: Joerg Roedel Cc: Yinghai Lu Cc: Jon D. Mason Cc: Corinna Schultz Cc: LKML-Reference: <20091202230556.GG10295@tux1.beaverton.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 849a0995d970..c563e4c8ff39 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -316,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); -- cgit v1.2.1 From 57fea8f7ab67ef42b7f84999e49e47f8717a2d5b Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 3 Dec 2009 19:06:40 +0800 Subject: x86/reboot: Add pci_dev_put in reboot_fixup_32.c for consistency pci_get_device will increase the ref count of found device. Although we're going to reset soon, we should use pci_dev_put to decrease the ref count for consistency. Signed-off-by: Xiaotian Feng Acked-by: H. Peter Anvin Cc: Yinghai Lu LKML-Reference: <1259838400-23833-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot_fixups_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 61a837743fe5..201eab63b05f 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -80,6 +80,7 @@ void mach_reboot_fixups(void) continue; cur->reboot_fixup(dev); + pci_dev_put(dev); } } -- cgit v1.2.1 From 7d1849aff6687a135a8da3a75e32a00e3137a5e2 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Thu, 3 Dec 2009 15:52:44 +0100 Subject: x86, apic: Enable lapic nmi watchdog on AMD Family 11h The x86 lapic nmi watchdog does not recognize AMD Family 11h, resulting in: NMI watchdog: CPU not supported As far as I can see from available documentation (the BKDM), family 11h looks identical to family 10h as far as the PMU is concerned. Extending the check to accept family 11h results in: Testing NMI watchdog ... OK. I've been running with this change on a Turion X2 Ultra ZM-82 laptop for a couple of weeks now without problems. Signed-off-by: Mikael Pettersson Cc: Andreas Herrmann Cc: Joerg Roedel Cc: LKML-Reference: <19223.53436.931768.278021@pilspetsen.it.uu.se> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fab786f60ed6..898df9719afb 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void) switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) + boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) return; wd_ops = &k7_wd_ops; break; -- cgit v1.2.1 From af901ca181d92aac3a7dc265144a9081a86d8f39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Sat, 14 Nov 2009 13:09:05 -0200 Subject: tree-wide: fix assorted typos all over the place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That is "success", "unknown", "through", "performance", "[re|un]mapping" , "access", "default", "reasonable", "[con]currently", "temperature" , "channel", "[un]used", "application", "example","hierarchy", "therefore" , "[over|under]flow", "contiguous", "threshold", "enough" and others. Signed-off-by: André Goddard Rosa Signed-off-by: Jiri Kosina --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/kprobes.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 67e929b89875..1c2c4838d35c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1122,7 +1122,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) if (!acpi_sci_override_gsi) acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); - /* Fill in identity legacy mapings where no override */ + /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); count = diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0285521e0a99..42ac5e000995 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1637,7 +1637,7 @@ retry: goto out; /* - * aperture was sucessfully enlarged by 128 MB, try + * aperture was successfully enlarged by 128 MB, try * allocation again */ goto retry; @@ -2396,7 +2396,7 @@ int __init amd_iommu_init_passthrough(void) struct pci_dev *dev = NULL; u16 devid, devid2; - /* allocate passthroug domain */ + /* allocate passthrough domain */ pt_domain = protection_domain_alloc(); if (!pt_domain) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b5801c311846..35be5802ac1e 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1229,7 +1229,7 @@ x86_perf_event_set_period(struct perf_event *event, return 0; /* - * If we are way outside a reasoable range then just skip forward: + * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 7b5169d2b000..7d377379fa4a 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -514,7 +514,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. + * remain disabled throughout this function. */ static int __kprobes kprobe_handler(struct pt_regs *regs) { @@ -851,7 +851,7 @@ no_change: /* * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. + * remain disabled throughout this function. */ static int __kprobes post_kprobe_handler(struct pt_regs *regs) { -- cgit v1.2.1 From 5d990b627537e59a3a2f039ff588a4750e9c1a6a Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Fri, 4 Dec 2009 12:15:21 -0800 Subject: PCI: add pci_request_acs Commit ae21ee65e8bc228416bbcc8a1da01c56a847a60c "PCI: acs p2p upsteram forwarding enabling" doesn't actually enable ACS. Add a function to pci core to allow an IOMMU to request that ACS be enabled. The existing mechanism of using iommu_found() in the pci core to know when ACS should be enabled doesn't actually work due to initialization order; iommu has only been detected not initialized. Have Intel and AMD IOMMUs request ACS, and Xen does as well during early init of dom0. Cc: Allen Kay Cc: David Woodhouse Cc: Jeremy Fitzhardinge Cc: Joerg Roedel Signed-off-by: Chris Wright Signed-off-by: Jesse Barnes --- arch/x86/kernel/amd_iommu_init.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b4b61d462dcc..e60530a5f524 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1330,6 +1330,8 @@ void __init amd_iommu_detect(void) gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; #endif + /* Make sure ACS will be enabled */ + pci_request_acs(); } } -- cgit v1.2.1 From 4832ddda2ec4df96ea1eed334ae2dbd65fc1f541 Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Fri, 4 Dec 2009 15:42:22 -0800 Subject: x86: ASUS P4S800 reboot=bios quirk Bug reporter noted their system with an ASUS P4S800 motherboard would hang when rebooting unless reboot=b was specified. Their dmidecode didn't contain descriptive System Information for Manufacturer or Product Name, so I used their Base Board Information to create a reboot quirk patch. The bug reporter confirmed this patch resolves the reboot hang. Handle 0x0001, DMI type 1, 25 bytes System Information Manufacturer: System Manufacturer Product Name: System Name Version: System Version Serial Number: SYS-1234567890 UUID: E0BFCD8B-7948-D911-A953-E486B4EEB67F Wake-up Type: Power Switch Handle 0x0002, DMI type 2, 8 bytes Base Board Information Manufacturer: ASUSTeK Computer INC. Product Name: P4S800 Version: REV 1.xx Serial Number: xxxxxxxxxxx BugLink: http://bugs.launchpad.net/bugs/366682 ASUS P4S800 will hang when rebooting unless reboot=b is specified. Add a quirk to reboot through the bios. Signed-off-by: Leann Ogasawara LKML-Reference: <1259972107.4629.275.camel@emiko> Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f93078746e00..6caf26010970 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -259,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; -- cgit v1.2.1 From 2f0993e0fb663c49e4d1e02654f6203246be4817 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 07:06:10 +0100 Subject: hw-breakpoints: Drop callback and task parameters from modify helper Drop the callback and task parameters from modify_user_hw_breakpoint(). For now we have no user that need to modify a breakpoint to the point of changing its handler or its task context. Signed-off-by: Frederic Weisbecker Cc: "K. Prasad" --- arch/x86/kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 04d182a7cfdb..dbb395572ae2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -618,7 +618,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, attr.bp_type = gen_type; attr.disabled = disabled; - return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + return modify_user_hw_breakpoint(bp, &attr); } /* @@ -740,7 +740,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + bp = modify_user_hw_breakpoint(bp, &attr); } /* * CHECKME: the previous code returned -EIO if the addr wasn't a -- cgit v1.2.1 From b326e9560a28fc3e950637ef51847ed8f05c1335 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 09:44:31 +0100 Subject: hw-breakpoints: Use overflow handler instead of the event callback struct perf_event::event callback was called when a breakpoint triggers. But this is a rather opaque callback, pretty tied-only to the breakpoint API and not really integrated into perf as it triggers even when we don't overflow. We prefer to use overflow_handler() as it fits into the perf events rules, being called only when we overflow. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" --- arch/x86/kernel/hw_breakpoint.c | 5 ++--- arch/x86/kernel/ptrace.c | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42f65ac4927..05d5fec64a94 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -362,8 +362,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - if (bp->callback) - ret = arch_store_info(bp); + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -519,7 +518,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) break; } - (bp->callback)(bp, args->regs); + perf_bp_event(bp, args->regs); rcu_read_unlock(); } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dbb395572ae2..b361d28061d0 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -555,7 +555,9 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, void *data) +static void ptrace_triggered(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); @@ -599,7 +601,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, { int err; int gen_len, gen_type; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; /* * We shoud have at least an inactive breakpoint at this @@ -721,9 +723,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct *t = &tsk->thread; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; if (!t->ptrace_bps[nr]) { + hw_breakpoint_init(&attr); /* * Put stub len and type to register (reserve) an inactive but * correct bp -- cgit v1.2.1 From 7f33f9c5cc3c99aeaf4d266a7ed502b828115a53 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 12:01:17 +0100 Subject: x86/perf: Exclude the debug stack from the callchains Dumping the callchains from breakpoint events with perf gives strange results: 3.75% perf [kernel] [k] _raw_read_unlock | --- _raw_read_unlock perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_bp_event hw_breakpoint_exceptions_notify notifier_call_chain __atomic_notifier_call_chain atomic_notifier_call_chain notify_die do_debug debug munmap We are infected with all the debug stack. Like the nmi stack, the debug stack is undesired as it is part of the profiling path, not helpful for the user. Ignore it. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" --- arch/x86/kernel/cpu/perf_event.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c1bbed1021d9..d35f26076ae5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2287,7 +2287,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); +static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2303,8 +2303,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); + per_cpu(in_ignored_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name) || + x86_is_stack_id(DEBUG_STACK, name); return 0; } @@ -2313,7 +2314,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_nmi_frame, smp_processor_id())) + if (per_cpu(in_ignored_frame, smp_processor_id())) return; if (reliable) -- cgit v1.2.1 From b625b3b3b740e177a1148594cd3ad5ff52f35315 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 6 Dec 2009 00:52:56 +0100 Subject: x86: Fixup wrong debug exception frame link in stacktraces While dumping a stacktrace, the end of the exception stack won't link the frame pointer to the previous stack. The interrupted stack will then be considered as unreliable and ignored by perf, as the frame pointer is unreliable itself. This happens because we overwrite the frame pointer that links to the interrupted frame with the address of the exception stack. This is done in order to reserve space inside. But rbp has been chosen here only because it is not a scratch register, so that the address of the exception stack remains in rbp after calling do_debug(), we can then release the exception stack space without the need to retrieve its address again. But we can pick another non-scratch register to do that, so that we preserve the link to the interrupted stack frame in the stacktraces. Just randomly choose r12. Every registers are saved just before and restored just after calling do_debug(). And r12 is not used in the middle, which makes it a perfect candidate. Example: perf record -g -a -c 1 -f -e mem:$(tasklist_lock_addr):rw Before: 44.18% [k] _raw_read_lock | | --- |--6.31%-- waitid | |--4.26%-- writev | |--3.63%-- __select | |--3.15%-- __waitpid | | | |--28.57%-- 0x8b52e00000139f | | | |--28.57%-- 0x8b52e0000013c6 | | | |--14.29%-- 0x7fde786dc000 | | | |--14.29%-- 0x62696c2f7273752f | | | --14.29%-- 0x1ea9df800000000 | |--3.00%-- __poll After: 43.94% [k] _raw_read_lock | --- _read_lock | |--60.53%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | synaptics_process_byte | psmouse_handle_byte | psmouse_interrupt | serio_interrupt | i8042_interrupt | handle_IRQ_event | handle_edge_irq | handle_irq | __irqentry_text_start | ret_from_intr | | | |--30.43%-- __select | | | |--17.39%-- 0x454f15 | | | |--13.04%-- __read | | | |--13.04%-- vread_hpet | | | |--13.04%-- _xcb_lock_io | | | --13.04%-- 0x7f630878ce87 Note: it does not only affect perf events but also other stacktraces in x86-64. They were considered as unreliable once we quit the debug stack frame. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 722df1b1152d..0f08a0cea3e0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1076,10 +1076,10 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %rbp) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + PER_CPU(init_tss, %r12) + subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) -- cgit v1.2.1 From af2d8289f57e427836be482c6f72cca674028121 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 6 Dec 2009 05:34:27 +0100 Subject: x86: Fixup wrong irq frame link in stacktraces When we enter in irq, two things can happen to preserve the link to the previous frame pointer: - If we were in an irq already, we don't switch to the irq stack as we are inside. We just need to save the previous frame pointer and to link the new one to the previous. - Otherwise we need another level of indirection. We enter the irq with the previous stack. We save the previous bp inside and make bp pointing to its saved address. Then we switch to the irq stack and push bp another time but to the new stack. This makes two levels to dereference instead of one. In the second case, the current stacktrace code omits the second level and loses the frame pointer accuracy. The stack that follows will then be considered as unreliable. Handling that makes the perf callchain happier. Before: 43.94% [k] _raw_read_lock | --- _read_lock | |--60.53%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | synaptics_process_byte | psmouse_handle_byte | psmouse_interrupt | serio_interrupt | i8042_interrupt | handle_IRQ_event | handle_edge_irq | handle_irq | __irqentry_text_start | ret_from_intr | | | |--30.43%-- __select | | | |--17.39%-- 0x454f15 | | | |--13.04%-- __read | | | |--13.04%-- vread_hpet | | | |--13.04%-- _xcb_lock_io | | | --13.04%-- 0x7f630878ce8 After: 50.00% [k] _raw_read_lock | --- _read_lock | |--98.97%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | | | |--96.88%-- synaptics_process_byte | | psmouse_handle_byte | | psmouse_interrupt | | serio_interrupt | | i8042_interrupt | | handle_IRQ_event | | handle_edge_irq | | handle_irq | | __irqentry_text_start | | ret_from_intr | | | | | |--39.78%-- __const_udelay | | | | | | | |--91.89%-- ath5k_hw_register_timeout | | | | ath5k_hw_noise_floor_calibration | | | | ath5k_hw_reset | | | | ath5k_reset | | | | ath5k_config | | | | ieee80211_hw_config | | | | | | | | | |--88.24%-- ieee80211_scan_work | | | | | worker_thread | | | | | kthread | | | | | child_rip | | | | | | | | | --11.76%-- ieee80211_scan_completed | | | | ieee80211_scan_work | | | | worker_thread | | | | kthread | | | | child_rip | | | | | | | --8.11%-- ath5k_hw_noise_floor_calibration | | | ath5k_hw_reset | | | ath5k_reset | | | ath5k_config Note: This does not only affect perf events but also x86-64 stacktraces. They were considered as unreliable once we quit the irq stack frame. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/dumpstack_64.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6be177e..004b8aa6a35f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -101,6 +101,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +/* + * We are returning from the irq stack and go to the previous one. + * If the previous stack is also in the irq stack, then bp in the first + * frame of the irq stack points to the previous, interrupted one. + * Otherwise we have another level of indirection: We first save + * the bp of the previous stack, then we switch the stack to the irq one + * and save a new bp that links to the previous one. + * (See save_args()) + */ +static inline unsigned long +fixup_bp_irq_link(unsigned long bp, unsigned long *stack, + unsigned long *irq_stack, unsigned long *irq_stack_end) +{ +#ifdef CONFIG_FRAME_POINTER + struct stack_frame *frame = (struct stack_frame *)bp; + + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) + return (unsigned long)frame->next_frame; +#endif + return bp; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -173,7 +202,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - if (stack >= irq_stack && stack < irq_stack_end) { + if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, @@ -184,6 +213,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); + bp = fixup_bp_irq_link(bp, stack, irq_stack, + irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; -- cgit v1.2.1 From a946d8f11f0da9cfc714248036fcfd3a794d1e27 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Dec 2009 12:59:46 +0100 Subject: x86: Fix bogus warning in apic_noop.apic_write() apic_noop is used to provide dummy apic functions. It's installed when the CPU has no APIC or when the APIC is disabled on the kernel command line. The apic_noop implementation of apic_write() warns when the CPU has an APIC or when the APIC is not disabled. That's bogus. The warning should only happen when the CPU has an APIC _AND_ the APIC is not disabled. apic_noop.apic_read() has the correct check. Signed-off-by: Thomas Gleixner Cc: Cyrill Gorcunov Cc: # in <= .32 this typo resides in native_apic_write_dummy() LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_noop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index d9acc3bee0f4..e31b9ffe25f5 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -127,7 +127,7 @@ static u32 noop_apic_read(u32 reg) static void noop_apic_write(u32 reg, u32 v) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE(cpu_has_apic && !disable_apic); } struct apic apic_noop = { -- cgit v1.2.1 From bc09effabf0c5c6c7021e5ef9af15a23579b32a8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 8 Dec 2009 11:21:37 +0900 Subject: x86/mce: Set up timer unconditionally mce_timer must be passed to setup_timer() in all cases, no matter whether it is going to be actually used. Otherwise, when the CPU gets brought down, its call to del_timer_sync() will never return, as the timer won't have a base associated, and hence lock_timer_base() will loop infinitely. Signed-off-by: Jan Beulich Signed-off-by: Hidetoshi Seto Cc: LKML-Reference: <4B1DB831.2030801@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d7ebf25d10ed..a96e5cd256a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1388,13 +1388,14 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = &__get_cpu_var(mce_timer); int *n = &__get_cpu_var(mce_next_interval); + setup_timer(t, mce_start_timer, smp_processor_id()); + if (mce_ignore_ce) return; *n = check_interval * HZ; if (!*n) return; - setup_timer(t, mce_start_timer, smp_processor_id()); t->expires = round_jiffies(jiffies + *n); add_timer_on(t, smp_processor_id()); } -- cgit v1.2.1 From 5c0e9f28da84c68ce0ae68b7a75faaf862e156e2 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 8 Dec 2009 16:52:44 +0900 Subject: x86, mce: fix confusion between bank attributes and mce attributes Commit cebe182033f156b430952370fb0f9dbe6e89b081 had an unnecessary, wrong change: &mce_banks[i].attr is equivalent to the former bank_attrs[i], not to mce_attrs[i]. Signed-off-by: Hidetoshi Seto Acked-by: Andi Kleen LKML-Reference: <4B1E05CC.4040703@jp.fujitsu.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a96e5cd256a9..a8aacd4b513c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1929,7 +1929,7 @@ error2: sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); sysdev_unregister(&per_cpu(mce_dev, cpu)); -- cgit v1.2.1 From f58e1f53de52a70391b6478617311207c7203363 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 8 Dec 2009 22:30:50 -0800 Subject: arch/x86/kernel/microcode*: Use pr_fmt() and remove duplicated KERN_ERR prefix - Use #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Remove "microcode: " prefix from each pr_ - Fix duplicated KERN_ERR prefix - Coalesce pr_ format strings - Add a space after an exclamation point No other change in output. Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Andreas Herrmann LKML-Reference: <1260340250.27677.191.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 40 ++++++++++++++++----------------- arch/x86/kernel/microcode_core.c | 26 ++++++++++++---------- arch/x86/kernel/microcode_intel.c | 47 +++++++++++++++++---------------------- 3 files changed, 53 insertions(+), 60 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 63123d902103..37542b67c57e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,6 +13,9 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -81,7 +84,7 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) memset(csig, 0, sizeof(*csig)); rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; } @@ -111,8 +114,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - pr_err(KERN_ERR "microcode: CPU%d: loading of chipset " - "specific code not yet supported\n", cpu); + pr_err("CPU%d: loading of chipset specific code not yet supported\n", + cpu); return 0; } @@ -141,12 +144,12 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("microcode: CPU%d: update failed " - "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); + pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; @@ -169,15 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; if (section_hdr[0] != UCODE_UCODE_TYPE) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return NULL; } total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("microcode: error: size mismatch\n"); + pr_err("error: size mismatch\n"); return NULL; } @@ -206,14 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf) size = buf_pos[2]; if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("microcode: error: invalid type field in " - "container file section header\n"); + pr_err("error: invalid type field in container file section header\n"); return 0; } equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); if (!equiv_cpu_table) { - pr_err("microcode: failed to allocate equivalent CPU table\n"); + pr_err("failed to allocate equivalent CPU table\n"); return 0; } @@ -246,7 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { - pr_err("microcode: failed to create equivalent cpu table\n"); + pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -277,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) if (!leftover) { vfree(uci->mc); uci->mc = new_mc; - pr_debug("microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); } else { vfree(new_mc); @@ -300,7 +300,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("microcode: invalid UCODE_MAGIC (0x%08x)\n", + pr_err("invalid UCODE_MAGIC (0x%08x)\n", *(u32 *)firmware->data); return UCODE_ERROR; } @@ -313,8 +313,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - pr_info("microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); + pr_info("AMD microcode update via /dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -334,14 +333,13 @@ void init_microcode_amd(struct device *device) WARN_ON(c->x86_vendor != X86_VENDOR_AMD); if (c->x86 < 0x10) { - pr_warning("microcode: AMD CPU family 0x%x not supported\n", - c->x86); + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); return; } supported_cpu = 1; if (request_firmware(&firmware, fw_name, device)) - pr_err("microcode: failed to load file %s\n", fw_name); + pr_err("failed to load file %s\n", fw_name); } void fini_microcode_amd(void) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index e68aae397869..844c02c65fcb 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -209,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > totalram_pages) { - pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); + pr_err("too much data (max %ld pages)\n", totalram_pages); return ret; } @@ -244,7 +247,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -359,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu) if (!uci->mc) return UCODE_NFOUND; - pr_debug("microcode: CPU%d updated upon resume\n", cpu); + pr_debug("CPU%d updated upon resume\n", cpu); apply_microcode_on_target(cpu); return UCODE_OK; @@ -379,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu) ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); if (ustate == UCODE_OK) { - pr_debug("microcode: CPU%d updated upon init\n", cpu); + pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } @@ -406,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) @@ -425,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; @@ -473,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - pr_debug("microcode: CPU%d added\n", cpu); + pr_debug("CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - pr_err("microcode: Failed to create group for CPU%d\n", cpu); + pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - pr_debug("microcode: CPU%d removed\n", cpu); + pr_debug("CPU%d removed\n", cpu); break; case CPU_DEAD: case CPU_UP_CANCELED_FROZEN: @@ -507,7 +510,7 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - pr_err("microcode: no support for this CPU vendor\n"); + pr_err("no support for this CPU vendor\n"); return -ENODEV; } @@ -541,8 +544,7 @@ static int __init microcode_init(void) register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " ," - " Peter Oruba\n"); + " , Peter Oruba\n"); return 0; } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0d334ddd0a96..ebd193e476ca 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,6 +70,9 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); + pr_err("CPU%d not a capable Intel processor\n", cpu_num); return -1; } @@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc) data_size = get_datasize(mc_header); if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); + pr_err("error! Bad data size in microcode data file\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); + pr_err("error! Unknown microcode update format\n"); return -EINVAL; } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); + pr_err("error! Small exttable size in microcode data file\n"); return -EINVAL; } ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); + pr_err("error! Bad exttable size in microcode data file\n"); return -EFAULT; } ext_sigcount = ext_header->count; @@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc) while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); + pr_warning("aborting, bad extended signature table checksum\n"); return -EINVAL; } } @@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc) while (i--) orig_sum += ((int *)mc)[i]; if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } if (!ext_table_size) @@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc) - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); + pr_err("aborting, bad checksum\n"); return -EINVAL; } } @@ -327,13 +324,11 @@ static int apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update " - "to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); return -1; } - printk(KERN_INFO "microcode: CPU%d updated to revision " - "0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, @@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc_size = get_totalsize(&mc_header); if (!mc_size || mc_size > leftover) { - printk(KERN_ERR "microcode: error!" - "Bad data in microcode data file\n"); + pr_err("error! Bad data in microcode data file\n"); break; } @@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); out: return state; } @@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) c->x86, c->x86_model, c->x86_mask); if (request_firmware(&firmware, name, device)) { - pr_debug("microcode: data file %s load failed\n", name); + pr_debug("data file %s load failed\n", name); return UCODE_NFOUND; } -- cgit v1.2.1 From 44234adcdce38f83c56e05f808ce656175b4beeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Dec 2009 09:25:48 +0100 Subject: hw-breakpoints: Modify breakpoints without unregistering them Currently, when ptrace needs to modify a breakpoint, like disabling it, changing its address, type or len, it calls modify_user_hw_breakpoint(). This latter will perform the heavy and racy task of unregistering the old breakpoint and registering a new one. This is racy as someone else might steal the reserved breakpoint slot under us, which is undesired as the breakpoint is only supposed to be modified, sometimes in the middle of a debugging workflow. We don't want our slot to be stolen in the middle. So instead of unregistering/registering the breakpoint, just disable it while we modify its breakpoint fields and re-enable it after if necessary. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Prasad LKML-Reference: <1260347148-5519-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 57 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b361d28061d0..7079ddaf0731 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -595,7 +595,7 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static struct perf_event * +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct task_struct *tsk, int disabled) { @@ -609,11 +609,11 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, * written the address register first */ if (!bp) - return ERR_PTR(-EINVAL); + return -EINVAL; err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) - return ERR_PTR(err); + return err; attr = bp->attr; attr.bp_len = gen_len; @@ -658,28 +658,17 @@ restore: if (!second_pass) continue; - thread->ptrace_bps[i] = NULL; - bp = ptrace_modify_breakpoint(bp, len, type, + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 1); - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + if (rc) break; - } - thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); - - /* Incorrect bp, or we have a bug in bp API */ - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + if (rc) break; - } - thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -737,26 +726,32 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.disabled = 1; bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + + /* + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; } else { + int err; + bp = t->ptrace_bps[nr]; - t->ptrace_bps[nr] = NULL; attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr); + err = modify_user_hw_breakpoint(bp, &attr); + if (err) + return err; } - /* - * CHECKME: the previous code returned -EIO if the addr wasn't a - * valid task virtual addr. The new one will return -EINVAL in this - * case. - * -EINVAL may be what we want for in-kernel breakpoints users, but - * -EIO looks better for ptrace, since we refuse a register writing - * for the user. And anyway this is the previous behaviour. - */ - if (IS_ERR(bp)) - return PTR_ERR(bp); - t->ptrace_bps[nr] = bp; return 0; } -- cgit v1.2.1 From 814e2c84a722c45650a9b8f52285d7ba6874f63b Mon Sep 17 00:00:00 2001 From: Andy Isaacson Date: Tue, 8 Dec 2009 00:29:42 -0800 Subject: x86: Factor duplicated code out of __show_regs() into show_regs_common() Unify x86_32 and x86_64 implementations of __show_regs() header, standardizing on the x86_64 format string in the process. Also, 32-bit will now call print_modules. Signed-off-by: Andy Isaacson Cc: Arjan van de Ven Cc: Robert Hancock Cc: Richard Zidlicky Cc: Andrew Morton LKML-Reference: <20091208082942.GA27174@hexapodia.org> [ v2: resolved conflict ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 18 ++++++++++++++++++ arch/x86/kernel/process_32.c | 14 +------------- arch/x86/kernel/process_64.c | 16 ++-------------- 3 files changed, 21 insertions(+), 27 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea15..90cf1250a005 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include #include #include @@ -90,6 +92,22 @@ void exit_thread(void) } } +void show_regs_common(void) +{ + const char *board; + + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + + printk("\n"); + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, board); +} + void flush_thread(void) { struct task_struct *tsk = current; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b35682..120b88797a75 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -128,7 +126,6 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -140,16 +137,7 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk("\n"); - - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + show_regs_common(); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790a..e5ab0cd0ef36 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -38,7 +37,6 @@ #include #include #include -#include #include #include @@ -163,18 +161,8 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + + show_regs_common(); printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, -- cgit v1.2.1 From a1884b8e558ef6395f6033f9e1b69b332dd040e0 Mon Sep 17 00:00:00 2001 From: Andy Isaacson Date: Tue, 8 Dec 2009 00:30:21 -0800 Subject: x86: Print DMI_BOARD_NAME as well as DMI_PRODUCT_NAME from __show_regs() Robert Hancock observes that DMI_BOARD_NAME is often more useful than DMI_PRODUCT_NAME, especially on standalone motherboards. So, print both. Signed-off-by: Andy Isaacson Cc: Arjan van de Ven Cc: Robert Hancock Cc: Richard Zidlicky Cc: Andrew Morton LKML-Reference: <20091208083021.GB27174@hexapodia.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 90cf1250a005..7a7bd4e3ec49 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -94,18 +94,21 @@ void exit_thread(void) void show_regs_common(void) { - const char *board; + const char *board, *product; - board = dmi_get_system_info(DMI_PRODUCT_NAME); + board = dmi_get_system_info(DMI_BOARD_NAME); if (!board) board = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; printk("\n"); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); + init_utsname()->version, board, product); } void flush_thread(void) -- cgit v1.2.1 From e258e4e0b495e6ecbd073d6bef1eafb62a58919a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:51 -0500 Subject: x86-32: Add new pt_regs stubs Add new stubs which add the pt_regs pointer as the last arg, matching 64-bit. This will allow these syscalls to be easily merged. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 49 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 50b9c220e121..34dbfa909dd7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -725,22 +725,49 @@ END(syscall_badsys) /* * System calls that need a pt_regs pointer. */ -#define PTREGSCALL(name) \ +#define PTREGSCALL0(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%eax; \ jmp sys_##name; -PTREGSCALL(iopl) -PTREGSCALL(fork) -PTREGSCALL(clone) -PTREGSCALL(vfork) -PTREGSCALL(execve) -PTREGSCALL(sigaltstack) -PTREGSCALL(sigreturn) -PTREGSCALL(rt_sigreturn) -PTREGSCALL(vm86) -PTREGSCALL(vm86old) +#define PTREGSCALL1(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%edx; \ + movl PT_EBX(%edx),%eax; \ + jmp sys_##name; + +#define PTREGSCALL2(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%ecx; \ + movl PT_ECX(%ecx),%edx; \ + movl PT_EBX(%ecx),%eax; \ + jmp sys_##name; + +#define PTREGSCALL3(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + pushl %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + ret + +PTREGSCALL0(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(clone) +PTREGSCALL0(vfork) +PTREGSCALL0(execve) +PTREGSCALL0(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL0(vm86) +PTREGSCALL0(vm86old) .macro FIXUP_ESPFIX_STACK /* -- cgit v1.2.1 From 27f59559d63375a4d59e7c720a439d9f0b47edad Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:52 -0500 Subject: x86: Merge sys_iopl Change 32-bit sys_iopl to PTREGSCALL1, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/ioport.c | 28 +++++----------------------- 2 files changed, 6 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 34dbfa909dd7..ab7fcef37453 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -758,7 +758,7 @@ ptregs_##name: \ addl $4,%esp; \ ret -PTREGSCALL0(iopl) +PTREGSCALL1(iopl) PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 99c4d308f16b..85ecc7c57ba6 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * on system-call entry - see also fork() and the signal handling * code. */ -static int do_iopl(unsigned int level, struct pt_regs *regs) +long sys_iopl(unsigned int level, struct pt_regs *regs) { unsigned int old = (regs->flags >> 12) & 3; + struct thread_struct *t = ¤t->thread; if (level > 3) return -EINVAL; @@ -115,29 +116,10 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); - - return 0; -} - #ifdef CONFIG_X86_32 -long sys_iopl(struct pt_regs *regs) -{ - unsigned int level = regs->bx; - struct thread_struct *t = ¤t->thread; - int rc; - - rc = do_iopl(level, regs); - if (rc < 0) - goto out; - t->iopl = level << 12; set_iopl_mask(t->iopl); -out: - return rc; -} -#else -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - return do_iopl(level, regs); -} #endif + + return 0; +} -- cgit v1.2.1 From 11cf88bd0b8165b65aaabaee0977e9a3ad474ab7 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:53 -0500 Subject: x86: Merge sys_execve Change 32-bit sys_execve to PTREGSCALL3, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/process.c | 26 ++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 25 ------------------------- arch/x86/kernel/process_64.c | 19 ------------------- 4 files changed, 27 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index ab7fcef37453..a96a0d8a0fdb 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -762,7 +762,7 @@ PTREGSCALL1(iopl) PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) -PTREGSCALL0(execve) +PTREGSCALL3(execve) PTREGSCALL0(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5e2ba634ea15..bb17bd9334fb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -235,6 +235,32 @@ int sys_vfork(struct pt_regs *regs) } +/* + * sys_execve() executes a new program. + */ +long sys_execve(char __user *name, char __user * __user *argv, + char __user * __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} + /* * Idle related variables and functions */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 075580b35682..486e38e2900b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -451,31 +451,6 @@ int sys_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } -/* - * sys_execve() executes a new program. - */ -int sys_execve(struct pt_regs *regs) -{ - int error; - char *filename; - - filename = getname((char __user *) regs->bx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, - (char __user * __user *) regs->cx, - (char __user * __user *) regs->dx, - regs); - if (error == 0) { - /* Make sure we don't return using sysenter.. */ - set_thread_flag(TIF_IRET); - } - putname(filename); -out: - return error; -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c95c8f4e790a..671960d82587 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -520,25 +520,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -/* - * sys_execve() executes a new program. - */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) -{ - long error; - char *filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, regs); - putname(filename); - return error; -} - void set_personality_64bit(void) { /* inherit personality from parent */ -- cgit v1.2.1 From 052acad48a566a6dbcccb95e5d22e5e1b7cac8dd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:54 -0500 Subject: x86: Merge sys_sigaltstack Change 32-bit sys_sigaltstack to PTREGSCALL2, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/signal.c | 12 +----------- 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a96a0d8a0fdb..621ef4599416 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -763,7 +763,7 @@ PTREGSCALL0(fork) PTREGSCALL0(clone) PTREGSCALL0(vfork) PTREGSCALL3(execve) -PTREGSCALL0(sigaltstack) +PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) PTREGSCALL0(vm86) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 74fe6d86dc5d..4fd173cd8e57 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -545,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_32 -int sys_sigaltstack(struct pt_regs *regs) -{ - const stack_t __user *uss = (const stack_t __user *)regs->bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} -#else /* !CONFIG_X86_32 */ -asmlinkage long +long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { return do_sigaltstack(uss, uoss, regs->sp); } -#endif /* CONFIG_X86_32 */ /* * Do a signal return; undo the signal stack. -- cgit v1.2.1 From f1382f157fb1175bba008abad0907310a1e459ce Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:55 -0500 Subject: x86, 32-bit: Convert sys_vm86 & sys_vm86old Convert these to new PTREGSCALL stubs. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 4 ++-- arch/x86/kernel/vm86_32.c | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 621ef4599416..6c2f25d9b9d5 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -766,8 +766,8 @@ PTREGSCALL3(execve) PTREGSCALL2(sigaltstack) PTREGSCALL0(sigreturn) PTREGSCALL0(rt_sigreturn) -PTREGSCALL0(vm86) -PTREGSCALL0(vm86old) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) .macro FIXUP_ESPFIX_STACK /* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9c4e62539058..5ffb5622f793 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,9 +197,8 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -int sys_vm86old(struct pt_regs *regs) +int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -227,7 +226,7 @@ out: } -int sys_vm86(struct pt_regs *regs) +int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs->bx) { + switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); + ret = do_vm86_irq_handling(cmd, (int)arg); goto out; case VM86_PLUS_INSTALL_CHECK: /* @@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs) ret = -EPERM; if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs->cx; + v86 = (struct vm86plus_struct __user *)arg; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); -- cgit v1.2.1 From f839bbc5c81b1c92ff8e81c360e9564f7b961b2e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 19:01:56 -0500 Subject: x86: Merge sys_clone Change 32-bit sys_clone to new PTREGSCALL stub, and merge with 64-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260403316-5679-7-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 14 +++++++++++++- arch/x86/kernel/process.c | 9 +++++++++ arch/x86/kernel/process_32.c | 15 --------------- arch/x86/kernel/process_64.c | 9 --------- 4 files changed, 22 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6c2f25d9b9d5..6492555d123d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -760,7 +760,6 @@ ptregs_##name: \ PTREGSCALL1(iopl) PTREGSCALL0(fork) -PTREGSCALL0(clone) PTREGSCALL0(vfork) PTREGSCALL3(execve) PTREGSCALL2(sigaltstack) @@ -769,6 +768,19 @@ PTREGSCALL0(rt_sigreturn) PTREGSCALL2(vm86) PTREGSCALL1(vm86old) +/* Clone is an oddball. The 4th arg is in %edi */ + ALIGN; +ptregs_clone: + leal 4(%esp),%eax + pushl %eax + pushl PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + ret + .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index bb17bd9334fb..f3c1a6b3a65e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -234,6 +234,15 @@ int sys_vfork(struct pt_regs *regs) NULL, NULL); } +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + /* * sys_execve() executes a new program. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 486e38e2900b..506d5a7ba17c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -436,21 +436,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -int sys_clone(struct pt_regs *regs) -{ - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs->bx; - newsp = regs->cx; - parent_tidptr = (int __user *)regs->dx; - child_tidptr = (int __user *)regs->di; - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); -} - #define top_esp (THREAD_SIZE - sizeof(unsigned long)) #define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 671960d82587..83019f94b83d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -534,15 +534,6 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long -sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->sp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - unsigned long get_wchan(struct task_struct *p) { unsigned long stack; -- cgit v1.2.1 From ce9119ad90b1caba550447bfcc0a21850558ca49 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 Dec 2009 16:33:44 -0800 Subject: x86-32: Avoid pipeline serialization in PTREGSCALL1 and 2 In the PTREGSCALL1 and 2 macros, we can trivially avoid an unnecessary pipeline serialization, so do so. In PTREGSCALLS3 this is much less clear-cut since we have to push a new value to the stack. Leave it alone for now assuming it is as good as it is going to be; may want to check on Atom or another in-order x86 to see if we can do better. Signed-off-by: H. Peter Anvin Cc: Brian Gerst LKML-Reference: <1260403316-5679-2-git-send-email-brgerst@gmail.com> --- arch/x86/kernel/entry_32.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6492555d123d..cb12b9bfc9cc 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -735,15 +735,15 @@ ptregs_##name: \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%edx; \ - movl PT_EBX(%edx),%eax; \ + movl (PT_EBX+4)(%esp),%eax; \ jmp sys_##name; #define PTREGSCALL2(name) \ ALIGN; \ ptregs_##name: \ leal 4(%esp),%ecx; \ - movl PT_ECX(%ecx),%edx; \ - movl PT_EBX(%ecx),%eax; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ jmp sys_##name; #define PTREGSCALL3(name) \ -- cgit v1.2.1 From fc380ceed7fe469728ea4acdbda4495ea943ee1c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 9 Dec 2009 16:54:08 -0800 Subject: x86-64, paravirt: Call set_iopl_mask() on 64 bits set_iopl_mask() is a no-op on 64 bits, but it is also a paravirt hook, so call it even on 64 bits. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge Cc: Brian Gerst LKML-Reference: <1260403316-5679-3-git-send-email-brgerst@gmail.com> --- arch/x86/kernel/ioport.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 85ecc7c57ba6..8eec0ec59af2 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -116,10 +116,8 @@ long sys_iopl(unsigned int level, struct pt_regs *regs) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); -#ifdef CONFIG_X86_32 t->iopl = level << 12; set_iopl_mask(t->iopl); -#endif return 0; } -- cgit v1.2.1 From 5cd476effe9570d2e520cf904d51f5c992972647 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:33 -0800 Subject: x86: es7000_32.c: Use pr_ and add pr_fmt(fmt) - Added #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Converted a few printk(KERN_INFO to pr_info( - Stripped "es7000_mipcfg" from pr_debug Signed-off-by: Joe Perches LKML-Reference: <3b4375af246dec5941168858910210937c110af9.1260383912.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index e85f8fb7f8e7..dd2b5f264643 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -27,6 +27,9 @@ * * http://www.unisys.com */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + pr_debug("host_reg = 0x%lx\n", (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + pr_debug("mip_reg = 0x%lx\n", (unsigned long)mip_reg); success++; break; @@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void) if (!es7000_plat) return; - printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); + pr_info("Enabling APIC mode.\n"); memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); es7000_mip_reg.off_0x00 = MIP_SW_APIC; es7000_mip_reg.off_0x38 = MIP_VALID; @@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void) { int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - printk(KERN_INFO - "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", + pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", (apic_version[apic] == 0x14) ? "Physical Cluster" : "Logical Cluster", nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); -- cgit v1.2.1 From 40685236b3161b80030a4df34bcbc5941ea59876 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 9 Dec 2009 10:45:34 -0800 Subject: x86: setup_percpu.c: Use pr_ and add pr_fmt(fmt) - Added #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Stripped PERCPU: from a pr_warning Signed-off-by: Joe Perches LKML-Reference: <7ead24eccbea8f2b11795abad3e2893a98e1e111.1260383912.git.joe@perches.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index d559af913e1f..35abcb8b00e9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -20,9 +22,9 @@ #include #ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) printk(KERN_DEBUG x) +# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) #else -# define DBG(x...) +# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) #endif DEFINE_PER_CPU(int, cpu_number); @@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, } else { ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), size, align, goal); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); } return ptr; #else @@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", + pr_warning("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) -- cgit v1.2.1 From b7cc9554bc73641c9ed4d7eb74b2d6e78f20abea Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Dec 2009 11:03:39 +0100 Subject: x86/amd-iommu: Fix passthrough mode The data structure changes to use dev->archdata.iommu field broke the iommu=pt mode because in this case the dev->archdata.iommu was left uninitialized. This moves the inititalization of the devices into the main init function and fixes the problem. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 39 +++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/amd_iommu_init.c | 7 +++++++ 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 32fb09102a13..450dd6ac03d3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -166,6 +166,43 @@ static void iommu_uninit_device(struct device *dev) { kfree(dev->archdata.iommu); } + +void __init amd_iommu_uninit_devices(void) +{ + struct pci_dev *pdev = NULL; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + iommu_uninit_device(&pdev->dev); + } +} + +int __init amd_iommu_init_devices(void) +{ + struct pci_dev *pdev = NULL; + int ret = 0; + + for_each_pci_dev(pdev) { + + if (!check_device(&pdev->dev)) + continue; + + ret = iommu_init_device(&pdev->dev); + if (ret) + goto out_free; + } + + return 0; + +out_free: + + amd_iommu_uninit_devices(); + + return ret; +} #ifdef CONFIG_AMD_IOMMU_STATS /* @@ -2145,8 +2182,6 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; - iommu_init_device(&dev->dev); - /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 7ffc39965233..df01c691d130 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1274,6 +1274,10 @@ static int __init amd_iommu_init(void) if (ret) goto free; + ret = amd_iommu_init_devices(); + if (ret) + goto free; + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else @@ -1296,6 +1300,9 @@ out: return ret; free: + + amd_iommu_uninit_devices(); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); -- cgit v1.2.1 From 8638c4914f34fedc1c13b1cc13f6d1e5a78c46b4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 10 Dec 2009 11:12:25 +0100 Subject: x86/amd-iommu: Fix PCI hotplug with passthrough mode The device change notifier is initialized in the dma_ops initialization path. But this path is never executed for iommu=pt. Move the notifier initialization to IOMMU hardware init code to fix this. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 7 +++++-- arch/x86/kernel/amd_iommu_init.c | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 450dd6ac03d3..a83185080e91 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1624,6 +1624,11 @@ static struct notifier_block device_nb = { .notifier_call = device_change_notifier, }; +void amd_iommu_init_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &device_nb); +} + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -2250,8 +2255,6 @@ int __init amd_iommu_init_dma_ops(void) register_iommu(&amd_iommu_ops); - bus_register_notifier(&pci_bus_type, &device_nb); - amd_iommu_stats_init(); return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index df01c691d130..309a52f96e0b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1285,6 +1285,8 @@ static int __init amd_iommu_init(void) if (ret) goto free; + amd_iommu_init_notifier(); + enable_iommus(); if (iommu_pass_through) -- cgit v1.2.1 From 5e855db5d8fec44e6604eb245aa9077bbd3f0d05 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 17:08:54 +0800 Subject: perf_event: Fix variable initialization in other codepaths Signed-off-by: Xiao Guangrong Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B20BAA6.7010609@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d35f26076ae5..1342f236e32a 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1632,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) data.period = event->hw.last_period; data.addr = 0; + data.raw = NULL; regs.ip = 0; /* @@ -1749,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1794,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 ack, status; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1857,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); -- cgit v1.2.1 From 125580380f418000b1a06d9a54700f1191b6e561 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 10 Dec 2009 19:56:34 +0300 Subject: x86, perf events: Check if we have APIC enabled Ralf Hildebrandt reported this boot warning: | Running a vanilla 2.6.32 as Xen DomU, I'm getting: | | [ 0.000999] CPU: Physical Processor ID: 0 | [ 0.000999] CPU: Processor Core ID: 1 | [ 0.000999] Performance Events: AMD PMU driver. | [ 0.000999] ------------[ cut here ]------------ | [ 0.000999] WARNING: at arch/x86/kernel/apic/apic.c:249 native_apic_write_dummy So we need to check if APIC functionality is available, and not just in the P6 driver but elsewhere as well. Reported-by: Ralf Hildebrandt Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091210165634.GF5086@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1342f236e32a..18f05eccbb62 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2066,12 +2066,6 @@ static __init int p6_pmu_init(void) x86_pmu = p6_pmu; - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - return 0; } @@ -2163,6 +2157,16 @@ static __init int amd_pmu_init(void) return 0; } +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; + + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); +} + void __init init_hw_perf_events(void) { int err; @@ -2184,6 +2188,8 @@ void __init init_hw_perf_events(void) return; } + pmu_check_apic(); + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { -- cgit v1.2.1 From 3bd95dfb182969dc6d2a317c150e0df7107608d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:40 -0500 Subject: x86, 64-bit: Move kernel_thread to C Prepare for merging with 32-bit. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 49 +++------------------------------------- arch/x86/kernel/process_64.c | 31 +++++++++++++++++++++++-- arch/x86/kernel/x8664_ksyms_64.c | 2 -- 3 files changed, 32 insertions(+), 50 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 63bca794c8f9..73d9b2c0e217 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1166,63 +1166,20 @@ bad_gs: jmp 2b .previous -/* - * Create a kernel thread. - * - * C extern interface: - * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) - * - * asm input arguments: - * rdi: fn, rsi: arg, rdx: flags - */ -ENTRY(kernel_thread) - CFI_STARTPROC - FAKE_STACK_FRAME $child_rip - SAVE_ALL - - # rdi: flags, rsi: usp, rdx: will be &pt_regs - movq %rdx,%rdi - orq kernel_thread_flags(%rip),%rdi - movq $-1, %rsi - movq %rsp, %rdx - - xorl %r8d,%r8d - xorl %r9d,%r9d - - # clone now - call do_fork - movq %rax,RAX(%rsp) - xorl %edi,%edi - - /* - * It isn't worth to check for reschedule here, - * so internally to the x86_64 port you can rely on kernel_thread() - * not to reschedule the child before returning, this avoids the need - * of hacks for example to fork off the per-CPU idle tasks. - * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC -END(kernel_thread) - -ENTRY(child_rip) +ENTRY(kernel_thread_helper) pushq $0 # fake return address CFI_STARTPROC /* * Here we are in the child and the registers are set as they were * at kernel_thread() invocation in the parent. */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax + call *%rsi # exit mov %eax, %edi call do_exit ud2 # padding for call trace CFI_ENDPROC -END(child_rip) +END(kernel_thread_helper) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 83019f94b83d..92484c2130c6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -59,8 +59,6 @@ asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) @@ -231,6 +229,35 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS; + regs.flags = X86_EFLAGS_IF; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, ~0UL, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a1029769b6f2..9fafaf83b3b8 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -17,8 +17,6 @@ EXPORT_SYMBOL(mcount); #endif -EXPORT_SYMBOL(kernel_thread); - EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); EXPORT_SYMBOL(__get_user_4); -- cgit v1.2.1 From fa4b8f84383ae197e643a46c36bf58ab8dffc95c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:41 -0500 Subject: x86, 64-bit: Use user_mode() to determine new stack pointer in copy_thread() Use user_mode() instead of a magic value for sp to determine when returning to kernel mode. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-3-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_64.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 92484c2130c6..00ac66fa5c6b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -254,7 +254,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.flags = X86_EFLAGS_IF; /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, ~0UL, ®s, 0, NULL, NULL); + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } EXPORT_SYMBOL(kernel_thread); @@ -312,8 +312,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, *childregs = *regs; childregs->ax = 0; - childregs->sp = sp; - if (sp == ~0UL) + if (user_mode(regs)) + childregs->sp = sp; + else childregs->sp = (unsigned long)childregs; p->thread.sp = (unsigned long) childregs; -- cgit v1.2.1 From e840227c141116171c89ab1abb5cc9fee6fdb488 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:42 -0500 Subject: x86, 32-bit: Use same regs as 64-bit for kernel_thread_helper The arg should be in %eax, but that is clobbered by the return value of clone. The function pointer can be in any register. Also, don't push args onto the stack, since regparm(3) is the normal calling convention now. Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-4-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 8 ++------ arch/x86/kernel/process_32.c | 8 ++++---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cb12b9bfc9cc..44a8e0dc6737 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1047,12 +1047,8 @@ END(spurious_interrupt_bug) ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder CFI_STARTPROC - movl %edx,%eax - push %edx - CFI_ADJUST_CFA_OFFSET 4 - call *%ebx - push %eax - CFI_ADJUST_CFA_OFFSET 4 + movl %edi,%eax + call *%esi call do_exit ud2 # padding for call trace CFI_ENDPROC diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 506d5a7ba17c..bd874d2b6ab1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -193,8 +193,8 @@ void show_regs(struct pt_regs *regs) } /* - * This gets run with %bx containing the - * function to call, and %dx containing + * This gets run with %si containing the + * function to call, and %di containing * the "args". */ extern void kernel_thread_helper(void); @@ -208,8 +208,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) memset(®s, 0, sizeof(regs)); - regs.bx = (unsigned long) fn; - regs.dx = (unsigned long) arg; + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; regs.ds = __USER_DS; regs.es = __USER_DS; -- cgit v1.2.1 From f443ff4201dd25cd4dec183f9919ecba90c8edc2 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:43 -0500 Subject: x86: Sync 32/64-bit kernel_thread Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-5-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process_32.c | 5 ++++- arch/x86/kernel/process_64.c | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd874d2b6ab1..f2e8b05a4f02 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -211,14 +211,17 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; +#ifdef CONFIG_X86_32 regs.ds = __USER_DS; regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#endif + regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 00ac66fa5c6b..d49a9094f6f3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -248,10 +248,17 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.si = (unsigned long) fn; regs.di = (unsigned long) arg; +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#endif + regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS; - regs.flags = X86_EFLAGS_IF; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -- cgit v1.2.1 From df59e7bf439918f523ac29e996ec1eebbed60440 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 9 Dec 2009 12:34:44 -0500 Subject: x86: Merge kernel_thread() Signed-off-by: Brian Gerst LKML-Reference: <1260380084-3707-6-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kernel/process_32.c | 36 ------------------------------------ arch/x86/kernel/process_64.c | 36 ------------------------------------ 3 files changed, 35 insertions(+), 72 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f3c1a6b3a65e..8705ccedd447 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -243,6 +243,41 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f2e8b05a4f02..ccf234266a2e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -192,42 +192,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, ®s->sp, regs->bp); } -/* - * This gets run with %si containing the - * function to call, and %di containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - - regs.si = (unsigned long) fn; - regs.di = (unsigned long) arg; - -#ifdef CONFIG_X86_32 - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; -#endif - - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d49a9094f6f3..1a362c5bec37 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -229,42 +229,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -/* - * This gets run with %si containing the - * function to call, and %di containing - * the "args". - */ -extern void kernel_thread_helper(void); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - - regs.si = (unsigned long) fn; - regs.di = (unsigned long) arg; - -#ifdef CONFIG_X86_32 - regs.ds = __USER_DS; - regs.es = __USER_DS; - regs.fs = __KERNEL_PERCPU; - regs.gs = __KERNEL_STACK_CANARY; -#endif - - regs.orig_ax = -1; - regs.ip = (unsigned long) kernel_thread_helper; - regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -} -EXPORT_SYMBOL(kernel_thread); - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { -- cgit v1.2.1 From ebb682f522411abbe358059a256a8672ec0bd55b Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 9 Dec 2009 13:36:45 -0500 Subject: x86, AMD: Fix stale cpuid4_info shared_map data in shared_cpu_map cpumasks The per_cpu cpuid4_info shared_map can contain stale data when CPUs are added and removed. The stale data can lead to a NULL pointer derefernce panic on a remove of a CPU that has had siblings previously removed. This patch resolves the panic by verifying a cpu is actually online before adding it to the shared_cpu_map, only examining cpus that are part of the same lower level cache, and by updating other siblings lowest level cache maps when a cpu is added. Signed-off-by: Prarit Bhargava LKML-Reference: <20091209183336.17855.98708.sendpatchset@prarit.bos.redhat.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6c40f6b5b340..63ada177b40c 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -507,18 +507,19 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; - int index_msb, i; + int index_msb, i, sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - struct cpuinfo_x86 *d; - for_each_online_cpu(i) { + for_each_cpu(i, c->llc_shared_map) { if (!per_cpu(cpuid4_info, i)) continue; - d = &cpu_data(i); this_leaf = CPUID4_INFO_IDX(i, index); - cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), - d->llc_shared_map); + for_each_cpu(sibling, c->llc_shared_map) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } } return; } -- cgit v1.2.1 From 893f38d144a4d96d2483cd7c3801d26e1b2c23e9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 10 Dec 2009 13:07:22 -0800 Subject: x86: Use find_e820() instead of hard coded trampoline address Jens found the following crash/regression: [ 0.000000] found SMP MP-table at [ffff8800000fdd80] fdd80 [ 0.000000] Kernel panic - not syncing: Overlapping early reservations 12-f011 MP-table mpc to 0-fff BIOS data page and [ 0.000000] Kernel panic - not syncing: Overlapping early reservations 12-f011 MP-table mpc to 6000-7fff TRAMPOLINE and bisected it to b24c2a9 ("x86: Move find_smp_config() earlier and avoid bootmem usage"). It turns out the BIOS is using the first 64k for mptable, without reserving it. So try to find good range for the real-mode trampoline instead of hard coding it, in case some bios tries to use that range for sth. Reported-by: Jens Axboe Signed-off-by: Yinghai Lu Tested-by: Jens Axboe Cc: Randy Dunlap LKML-Reference: <4B21630A.6000308@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 ++++++++++- arch/x86/kernel/head32.c | 2 -- arch/x86/kernel/head64.c | 2 -- arch/x86/kernel/mpparse.c | 3 --- arch/x86/kernel/setup.c | 13 ++++++++----- arch/x86/kernel/trampoline.c | 20 +++++++++----------- 6 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d17d482a04f4..f50447d961c0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -732,7 +732,16 @@ struct early_res { char overlap_ok; }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { - { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ + { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ +#ifdef CONFIG_X86_32 + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 }, +#endif + {} }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 4f8e2507e8f3..5051b94c9069 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 0b06cd778fd9..b5a9896ca1e7 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - reserve_trampoline_memory(); - reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 35a57c963df9..40b54ceb68b5 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -945,9 +945,6 @@ void __init early_reserve_e820_mpc_new(void) { if (enable_update_mptable && alloc_mptable) { u64 startt = 0; -#ifdef CONFIG_X86_TRAMPOLINE - startt = TRAMPOLINE_BASE; -#endif mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); } } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 946a311a25c9..f7b8b9894b22 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -73,6 +73,7 @@ #include #include +#include #include #include #include @@ -875,6 +876,13 @@ void __init setup_arch(char **cmdline_p) reserve_brk(); + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + reserve_trampoline_memory(); + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -921,11 +929,6 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); - #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index cd022121cab6..c652ef62742d 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -12,21 +12,19 @@ #endif /* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base; void __init reserve_trampoline_memory(void) { -#ifdef CONFIG_X86_32 - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); -#endif + unsigned long mem; + /* Has to be in very low memory so we can execute real-mode AP code. */ - reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, - "TRAMPOLINE"); + mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + if (mem == -1L) + panic("Cannot allocate trampoline\n"); + + trampoline_base = __va(mem); + reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); } /* -- cgit v1.2.1 From f8b7256096a20436f6d0926747e3ac3d64c81d24 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Nov 2009 17:37:04 -0500 Subject: Unify sys_mmap* New helper - sys_mmap_pgoff(); switch syscalls to using it. Acked-by: David S. Miller Signed-off-by: Al Viro --- arch/x86/kernel/sys_i386_32.c | 27 +-------------------------- arch/x86/kernel/sys_x86_64.c | 17 +---------------- arch/x86/kernel/syscall_table_32.S | 2 +- 3 files changed, 3 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 1884a8d12bfa..dee1ff7cba58 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,31 +24,6 @@ #include -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 45e00eb09c3a..8aa2057efd12 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 70c2125d55b9..15228b5d3eb7 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long ptregs_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ -- cgit v1.2.1 From a5d09d68335bb8422d5e7050c9f03f99ba6cfebd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Fri, 11 Dec 2009 08:43:12 -0600 Subject: kgdb,x86: remove redundant test The for loop starts with a breakno of 0, and ends when it's 4. so this test is always true. Signed-off-by: Roel Kluin Signed-off-by: Andrew Morton Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 20a5b3689463..f93d015753ce 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -220,8 +220,7 @@ static void kgdb_correct_hw_break(void) dr7 |= ((breakinfo[breakno].len << 2) | breakinfo[breakno].type) << ((breakno << 2) + 16); - if (breakno >= 0 && breakno <= 3) - set_debugreg(breakinfo[breakno].addr, breakno); + set_debugreg(breakinfo[breakno].addr, breakno); } else { if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { -- cgit v1.2.1 From cf6f196d112a6f6757b1ca3cce0b576f7abee479 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:16 -0600 Subject: kgdb,i386: Fix corner case access to ss with NMI watch dog exception It is possible for the user_mode_vm(regs) check to return true on the i368 arch for a non master kgdb cpu or when the master kgdb cpu handles the NMI watch dog exception. The solution is simply to select the correct gdb_ss location based on the check to user_mode_vm(regs). CC: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f93d015753ce..aefae46aa646 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -86,9 +86,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; + if (user_mode_vm(regs)) { + gdb_regs[GDB_SS] = regs->ss; + gdb_regs[GDB_SP] = regs->sp; + } else { + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + } #else gdb_regs[GDB_R8] = regs->r8; gdb_regs[GDB_R9] = regs->r9; @@ -101,8 +107,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs32[GDB_PS] = regs->flags; gdb_regs32[GDB_CS] = regs->cs; gdb_regs32[GDB_SS] = regs->ss; -#endif gdb_regs[GDB_SP] = kernel_stack_pointer(regs); +#endif } /** -- cgit v1.2.1 From 8097551d9ab9b9e3630694ad1bc6e12c597c515e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 11 Dec 2009 08:43:18 -0600 Subject: kgdb,x86: do not set kgdb_single_step on x86 On an SMP system the kgdb_single_step flag has the possibility to indefinitely hang the system in the case. Consider the case where, CPU 1 has the schedule lock and CPU 0 is set to single step, there is no way for CPU 0 to run another task. The easy way to observe the problem is to make 2 cpus busy, and run the kgdb test suite. You will see that it hangs the system very quickly. while [ 1 ] ; do find /proc > /dev/null 2>&1 ; done & while [ 1 ] ; do find /proc > /dev/null 2>&1 ; done & echo V1 > /sys/module/kgdbts/parameters/kgdbts The side effect of this patch is that there is the possibility to miss a breakpoint in the case that a single step operation was executed to step over a breakpoint in common code. The trade off of the missed breakpoint is preferred to hanging the kernel. This can be fixed in the future by using kprobes or another strategy to step over planted breakpoints with out of line execution. CC: Ingo Molnar Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index aefae46aa646..dd74fe7273b1 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -400,7 +400,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, /* set the trace bit if we're stepping */ if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; - kgdb_single_step = 1; atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id()); } -- cgit v1.2.1 From 450b1e8dd10f41b5adad73f48ce8f6707d17c5c4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 11 Dec 2009 08:08:50 -0800 Subject: x86: Remove enabling x2apic message for every CPU Print only once that the system is supporting x2apic mode. Signed-off-by: Mike Travis Acked-by: Cyrill Gorcunov LKML-Reference: <4B226E92.5080904@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index efb2b9cd132c..aa57c079c98f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1341,7 +1341,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - pr_info("Enabling x2apic\n"); + printk_once(KERN_INFO "Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } -- cgit v1.2.1 From 2eaad1fddd7450a48ad464229775f97fbfe8af36 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 10 Dec 2009 17:19:36 -0800 Subject: x86: Limit the number of processor bootup messages When there are a large number of processors in a system, there is an excessive amount of messages sent to the system console. It's estimated that with 4096 processors in a system, and the console baudrate set to 56K, the startup messages will take about 84 minutes to clear the serial port. This set of patches limits the number of repetitious messages which contain no additional information. Much of this information is obtainable from the /proc and /sysfs. Some of the messages are also sent to the kernel log buffer as KERN_DEBUG messages so dmesg can be used to examine more closely any details specific to a problem. The new cpu bootup sequence for system_state == SYSTEM_BOOTING: Booting Node 0, Processors #1 #2 #3 #4 #5 #6 #7 Ok. Booting Node 1, Processors #8 #9 #10 #11 #12 #13 #14 #15 Ok. ... Booting Node 3, Processors #56 #57 #58 #59 #60 #61 #62 #63 Ok. Brought up 64 CPUs After the system is running, a single line boot message is displayed when CPU's are hotplugged on: Booting Node %d Processor %d APIC 0x%x Status of the following lines: CPU: Physical Processor ID: printed once (for boot cpu) CPU: Processor Core ID: printed once (for boot cpu) CPU: Hyper-Threading is disabled printed once (for boot cpu) CPU: Thermal monitoring enabled printed once (for boot cpu) CPU %d/0x%x -> Node %d: removed CPU %d is now offline: only if system_state == RUNNING Initializing CPU#%d: KERN_DEBUG Signed-off-by: Mike Travis LKML-Reference: <4B219E28.8080601@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 15 ++++++---- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/common.c | 8 ++++-- arch/x86/kernel/cpu/intel.c | 2 -- arch/x86/kernel/cpu/mcheck/therm_throt.c | 4 +-- arch/x86/kernel/smpboot.c | 45 ++++++++++++++++++++---------- 6 files changed, 47 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index c965e5212714..468489b57aae 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -74,6 +74,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; + static bool printed; if (c->cpuid_level < 0xb) return; @@ -127,12 +128,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } return; #endif } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7128b3799cec..8dc3ea145c97 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -375,8 +375,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = nearby_node(apicid); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1afa990a6c8..0ee9a3254eec 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -427,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; + static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) return; @@ -442,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -469,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) ((1 << core_bits) - 1); out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printed = 1; } #endif } @@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_debug("Initializing CPU#%d\n", cpu); clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c900b73f9224..9c31e8b09d2c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -270,8 +270,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) node = cpu_to_node(cpu); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4fef985fc221..1003ed4bbce4 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -339,8 +339,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 29e6744f51e3..678d0b8c26f3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -671,6 +671,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* reduce the number of lines printed when booting a large cpu count system */ +static void __cpuinit announce_cpu(int cpu, int apicid) +{ + static int current_node = -1; + int node = cpu_to_node(cpu); + + if (system_state == SYSTEM_BOOTING) { + if (node != current_node) { + if (current_node > (-1)) + pr_cont(" Ok.\n"); + current_node = node; + pr_info("Booting Node %3d, Processors ", node); + } + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + return; + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -737,9 +757,8 @@ do_rest: /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); - /* So we see what's up */ - printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", - cpu, apicid, start_ip); + /* So we see what's up */ + announce_cpu(cpu, apicid); /* * This grunge runs the startup process for @@ -788,21 +807,17 @@ do_rest: udelay(100); } - if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - pr_debug("OK.\n"); - printk(KERN_INFO "CPU%d: ", cpu); - print_cpu_info(&cpu_data(cpu)); - pr_debug("CPU has booted.\n"); - } else { + if (cpumask_test_cpu(cpu, cpu_callin_mask)) + pr_debug("CPU%d: has booted.\n", cpu); + else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) == 0xA5) /* trampoline started but...? */ - printk(KERN_ERR "Stuck ??\n"); + pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - printk(KERN_ERR "Not responding.\n"); + pr_err("CPU%d: Not responding.\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -1293,14 +1308,16 @@ void native_cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - printk(KERN_INFO "CPU %d is now offline\n", cpu); + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + if (1 == num_online_cpus()) alternatives_smp_switch(0); return; } msleep(100); } - printk(KERN_ERR "CPU %u didn't die...\n", cpu); + pr_err("CPU %u didn't die...\n", cpu); } void play_dead_common(void) -- cgit v1.2.1 From f4780ca005404166cc40af77ef0e86132ab98a81 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 14 Dec 2009 11:52:14 +0900 Subject: x86: Move swiotlb initialization before dma32_free_bootmem The commit 75f1cdf1dda92cae037ec848ae63690d91913eac introduced a bug that we initialize SWIOTLB right after dma32_free_bootmem so we wrongly steal memory area allocated for GART with broken BIOS earlier. This moves swiotlb initialization before dma32_free_bootmem(). Signed-off-by: FUJITA Tomonori Cc: yinghai@kernel.org LKML-Reference: <1260759135-6450-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index afcc58b69c7c..fcc2f2bfa39c 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -120,11 +120,14 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { + int use_swiotlb; + + use_swiotlb = pci_swiotlb_init(); #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif - if (pci_swiotlb_init()) + if (use_swiotlb) return; gart_iommu_hole_init(); -- cgit v1.2.1 From f3eee54276dfd1117fd94259f2b4a38388264724 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 14 Dec 2009 11:52:15 +0900 Subject: x86: Gart: fix breakage due to IOMMU initialization cleanup This fixes the following breakage of the commit 75f1cdf1dda92cae037ec848ae63690d91913eac: - GART systems that don't AGP with broken BIOS and more than 4GB memory are forced to use swiotlb. They can allocate aperture by hand and use GART. - GART systems without GAP must disable GART on shutdown. - swiotlb usage is forced by the boot option, gart_iommu_hole_init() is not called, so we disable GART early_gart_iommu_check(). Signed-off-by: Yinghai Lu Signed-off-by: FUJITA Tomonori LKML-Reference: <1260759135-6450-3-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 11 ++++++----- arch/x86/kernel/pci-gart_64.c | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index e0dfb6856aa2..3704997e8b25 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,8 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - int i, fix, slot; + u32 agp_aper_base = 0, agp_aper_order = 0; + int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; u64 aper_base = 0, last_aper_base = 0; @@ -290,6 +291,8 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; @@ -342,10 +345,10 @@ void __init early_gart_iommu_check(void) } } - if (!fix) + if (valid_agp) return; - /* different nodes have different setting, disable them all at first*/ + /* disable them all at first */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; @@ -458,8 +461,6 @@ out: if (aper_alloc) { /* Got the aperture from the AGP bridge */ - } else if (!valid_agp) { - /* Do nothing */ } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index e6a0d402f171..56c0e730d3fe 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -710,7 +710,8 @@ static void gart_iommu_shutdown(void) struct pci_dev *dev; int i; - if (no_agp) + /* don't shutdown it if there is AGP installed */ + if (!no_agp) return; for (i = 0; i < num_k8_northbridges; i++) { -- cgit v1.2.1 From 485a2e1973fd9f98c2c6776e66ac4721882b69e0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 14 Dec 2009 17:56:34 +0900 Subject: x86, mce: Thermal monitoring depends on APIC being enabled Add check if APIC is not disabled since thermal monitoring depends on it. As only apic gets disabled we should not try to install "thermal monitor" vector, print out that thermal monitoring is enabled and etc... Note that "Intel Correct Machine Check Interrupts" already has such a check. Also I decided to not add cpu_has_apic check into mcheck_intel_therm_init since even if it'll call apic_read on disabled apic -- it's safe here and allow us to save a few code bytes. Reported-by: Thomas Gleixner Signed-off-by: Cyrill Gorcunov Signed-off-by: Hidetoshi Seto LKML-Reference: <4B25FDC2.3020401@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1003ed4bbce4..0a9b57702be4 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -274,8 +274,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + /* Thermal monitoring depends on APIC, ACPI and clock modulation */ + if (!cpu_has_apic || !cpu_has(c, X86_FEATURE_ACPI) || + !cpu_has(c, X86_FEATURE_ACC)) return; /* -- cgit v1.2.1 From 70fe440718d9f42bf963c2cffe12008eb5556165 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Mon, 14 Dec 2009 17:57:00 +0900 Subject: x86, mce: Clean up thermal init by introducing intel_thermal_supported() It looks better to have a common function. No change in functionality. Signed-off-by: Hidetoshi Seto Cc: Cyrill Gorcunov LKML-Reference: <4B25FDDC.407@jp.fujitsu.com> Signed-off-by: Ingo Molnar Cc: Cyrill Gorcunov --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 0a9b57702be4..81c499eceb21 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -256,6 +256,16 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) ack_APIC_irq(); } +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!cpu_has_apic) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + void __init mcheck_intel_therm_init(void) { /* @@ -263,8 +273,7 @@ void __init mcheck_intel_therm_init(void) * LVT value on BSP and use that value to restore APs' thermal LVT * entry BIOS programmed later */ - if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) && - cpu_has(&boot_cpu_data, X86_FEATURE_ACC)) + if (intel_thermal_supported(&boot_cpu_data)) lvtthmr_init = apic_read(APIC_LVTTHMR); } @@ -274,9 +283,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on APIC, ACPI and clock modulation */ - if (!cpu_has_apic || !cpu_has(c, X86_FEATURE_ACPI) || - !cpu_has(c, X86_FEATURE_ACC)) + if (!intel_thermal_supported(c)) return; /* -- cgit v1.2.1 From 494c2ebfb287eb10b229415063099e3700639028 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 14 Dec 2009 10:02:18 -0800 Subject: x86, msr: Remove incorrect, duplicated code in the MSR driver The MSR driver would compute the values for cpu and c at declaration, and then again in the body of the function. This isn't merely redundant, but unsafe, since cpu might not refer to a valid CPU at that point. Remove the unnecessary and dangerous references in the declarations. This code now matches the equivalent code in the CPUID driver. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/msr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 553449951b84..572b07eee3f4 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -172,11 +172,10 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int cpu; + struct cpuinfo_x86 *c; cpu = iminor(file->f_path.dentry->d_inode); - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; /* No such CPU */ -- cgit v1.2.1 From 873b5271f878a11729fb4602c6ce967d0ff81119 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 14 Dec 2009 13:55:20 -0800 Subject: x86: Regex support and known-movable symbols for relocs, fix _end This adds a new category of symbols to the relocs program: symbols which are known to be relative, even though the linker emits them as absolute; this is the case for symbols that live in the linker script, which currently applies to _end. Unfortunately the previous workaround of putting _end in its own empty section was defeated by newer binutils, which remove empty sections completely. This patch also changes the symbol matching to use regular expressions instead of hardcoded C for specific patterns. This is a decidedly non-minimal patch: a modified version of the relocs program is used as part of the Syslinux build, and this is basically a backport to Linux of some of those changes; they have thus been well tested. Signed-off-by: H. Peter Anvin LKML-Reference: <4AF86211.3070103@zytor.com> Acked-by: Michal Marek Tested-by: Sedat Dilek --- arch/x86/kernel/vmlinux.lds.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index f3f2104408d9..f92a0da608cb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -319,9 +319,7 @@ SECTIONS __brk_limit = .; } - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = .; - } + _end = .; STABS_DEBUG DWARF_DEBUG -- cgit v1.2.1 From 445c89514be242b1b0080056d50bdc1b72adeb5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 19:49:50 +0100 Subject: locking: Convert raw_spinlock to arch_spinlock The raw_spin* namespace was taken by lockdep for the architecture specific implementations. raw_spin_* would be the ideal name space for the spinlocks which are not converted to sleeping locks in preempt-rt. Linus suggested to convert the raw_ to arch_ locks and cleanup the name space instead of using an artifical name like core_spin, atomic_spin or whatever No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/paravirt-spinlocks.c | 2 +- arch/x86/kernel/tsc_sync.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index b8ce165dde5d..0862d9d89c92 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -188,7 +188,7 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 3a7c5a44082e..a0f39e090684 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,7 +8,7 @@ #include static inline void -default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) +default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { __raw_spin_lock(lock); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index eed156851f5d..9f908b9d1abe 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count; * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata arch_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; -- cgit v1.2.1 From edc35bd72e2079b25f99c5da7d7a65dbbffc4a26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Dec 2009 12:38:57 +0100 Subject: locking: Rename __RAW_SPIN_LOCK_UNLOCKED to __ARCH_SPIN_LOCK_UNLOCKED Further name space cleanup. No functional change Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/tsc_sync.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0862d9d89c92..5b75afac8a38 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -188,7 +188,7 @@ void dump_stack(void) } EXPORT_SYMBOL(dump_stack); -static arch_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9f908b9d1abe..f1714697a09a 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count; * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata arch_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; -- cgit v1.2.1 From 0199c4e68d1f02894bdefe4b5d9e9ee4aedd8d62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2009 20:01:25 +0100 Subject: locking: Convert __raw_spin* functions to arch_spin* Name space cleanup. No functional change. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: David S. Miller Acked-by: Ingo Molnar Cc: linux-arch@vger.kernel.org --- arch/x86/kernel/dumpstack.c | 6 +++--- arch/x86/kernel/paravirt-spinlocks.c | 2 +- arch/x86/kernel/tsc_sync.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5b75afac8a38..0a0aa1cec8f1 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -207,11 +207,11 @@ unsigned __kprobes long oops_begin(void) /* racy, but better than risking deadlock. */ raw_local_irq_save(flags); cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { + if (!arch_spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. should stop eventually */; else - __raw_spin_lock(&die_lock); + arch_spin_lock(&die_lock); } die_nest_count++; die_owner = cpu; @@ -231,7 +231,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) die_nest_count--; if (!die_nest_count) /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); + arch_spin_unlock(&die_lock); raw_local_irq_restore(flags); oops_exit(); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index a0f39e090684..676b8c77a976 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -10,7 +10,7 @@ static inline void default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { - __raw_spin_lock(lock); + arch_spin_lock(lock); } struct pv_lock_ops pv_lock_ops = { diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f1714697a09a..0aa5fed8b9e6 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void) * previous TSC that was measured (possibly on * another CPU) and update the previous TSC timestamp. */ - __raw_spin_lock(&sync_lock); + arch_spin_lock(&sync_lock); prev = last_tsc; rdtsc_barrier(); now = get_cycles(); rdtsc_barrier(); last_tsc = now; - __raw_spin_unlock(&sync_lock); + arch_spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether @@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void) * we saw a time-warp of the TSC going backwards: */ if (unlikely(prev > now)) { - __raw_spin_lock(&sync_lock); + arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); nr_warps++; - __raw_spin_unlock(&sync_lock); + arch_spin_unlock(&sync_lock); } } WARN(!(now-start), -- cgit v1.2.1 From 239007b8440abff689632f50cdf0f2b9e895b534 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2009 16:46:45 +0100 Subject: genirq: Convert irq_desc.lock to raw_spinlock Convert locks which cannot be sleeping locks in preempt-rt to raw_spinlocks. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/irq.c | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5d498fbee4b..11a5851f1f50 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2431,7 +2431,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; @@ -2450,7 +2450,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) } __get_cpu_var(vector_irq)[vector] = -1; unlock: - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } irq_exit(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 664bcb7384ac..91fd0c70a18a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc) return 0; - spin_lock_irqsave(&desc->lock, flags); + raw_spin_lock_irqsave(&desc->lock, flags); for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; @@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); out: - spin_unlock_irqrestore(&desc->lock, flags); + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -294,12 +294,12 @@ void fixup_irqs(void) continue; /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); affinity = desc->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); continue; } @@ -326,7 +326,7 @@ void fixup_irqs(void) if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) desc->chip->unmask(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); @@ -356,10 +356,10 @@ void fixup_irqs(void) irq = __get_cpu_var(vector_irq)[vector]; desc = irq_to_desc(irq); - spin_lock(&desc->lock); + raw_spin_lock(&desc->lock); if (desc->chip->retrigger) desc->chip->retrigger(irq); - spin_unlock(&desc->lock); + raw_spin_unlock(&desc->lock); } } } -- cgit v1.2.1 From 03a05ed1152944000151d57b71000de287a1eb02 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Fri, 11 Dec 2009 15:17:20 +0800 Subject: ACPI: Use the ARB_DISABLE for the CPU which model id is less than 0x0f. Currently, ARB_DISABLE is a NOP on all of the recent Intel platforms. For such platforms, reduce contention on c3_lock by skipping the fake ARB_DISABLE. The cpu model id on one laptop is 14. If we disable ARB_DISABLE on this box, the box can't be booted correctly. But if we still enable ARB_DISABLE on this box, the box can be booted correctly. So we still use the ARB_DISABLE for the cpu which mode id is less than 0x0f. http://bugzilla.kernel.org/show_bug.cgi?id=14700 Signed-off-by: Zhao Yakui Acked-by: Pallipadi, Venkatesh cc: stable@kernel.org Signed-off-by: Len Brown --- arch/x86/kernel/acpi/cstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 59cdfa4686b2..2e837f5080fe 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); -- cgit v1.2.1 From e6428047725d72d63c1d9c4ba852e635e3ffe52a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 15 Dec 2009 16:28:13 -0600 Subject: x86: don't export inline function For CONFIG_PARAVIRT, load_gs_index is an inline function (it's #defined to native_load_gs_index otherwise). Exporting an inline function breaks the new assembler-based alphabetical sorted symbol list: Today's linux-next build (x86_64 allmodconfig) failed like this: .tmp_exports-asm.o: In function `__ksymtab_load_gs_index': (__ksymtab_sorted+0x5b40): undefined reference to `load_gs_index' Signed-off-by: Rusty Russell To: x86@kernel.org Cc: alan-jenkins@tuffmail.co.uk --- arch/x86/kernel/x8664_ksyms_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a1029769b6f2..084c1adc45f5 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -56,4 +56,6 @@ EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(init_level4_pgt); -EXPORT_SYMBOL(load_gs_index); +#ifndef CONFIG_PARAVIRT +EXPORT_SYMBOL(native_load_gs_index); +#endif -- cgit v1.2.1 From 186a25026c44d1bfa97671110ff14dcd0c99678e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Dec 2009 20:47:56 +0900 Subject: x86: Split swiotlb initialization into two stages The commit f4780ca005404166cc40af77ef0e86132ab98a81 moves swiotlb initialization before dma32_free_bootmem(). It's supposed to fix a bug that the commit 75f1cdf1dda92cae037ec848ae63690d91913eac introduced, we initialize SWIOTLB right after dma32_free_bootmem so we wrongly steal memory area allocated for GART with broken BIOS earlier. However, the above commit introduced another problem, which likely breaks machines with huge amount of memory. Such a box use the majority of DMA32_ZONE so there is no memory for swiotlb. With this patch, the x86 IOMMU initialization sequence are: 1. We set swiotlb to 1 in the case of (max_pfn > MAX_DMA32_PFN && !no_iommu). If swiotlb usage is forced by the boot option, we go to the step 3 and finish (we don't try to detect IOMMUs). 2. We call the detection functions of all the IOMMUs. The detection function sets x86_init.iommu.iommu_init to the IOMMU initialization function (so we can avoid calling the initialization functions of all the IOMMUs needlessly). 3. We initialize swiotlb (and set dma_ops to swiotlb_dma_ops) if swiotlb is set to 1. 4. If the IOMMU initialization function doesn't need swiotlb (e.g. the initialization is sucessful) then sets swiotlb to zero. 5. If we find that swiotlb is set to zero, we free swiotlb resource. Reported-by: Yinghai Lu Reported-by: Roland Dreier Signed-off-by: FUJITA Tomonori LKML-Reference: <20091215204729A.fujita.tomonori@lab.ntt.co.jp> Tested-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 9 ++++----- arch/x86/kernel/pci-swiotlb.c | 11 +++++++---- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fcc2f2bfa39c..75e14e21f61a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -120,15 +120,12 @@ static void __init dma32_free_bootmem(void) void __init pci_iommu_alloc(void) { - int use_swiotlb; - - use_swiotlb = pci_swiotlb_init(); #ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); #endif - if (use_swiotlb) - return; + if (pci_swiotlb_detect()) + goto out; gart_iommu_hole_init(); @@ -138,6 +135,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); +out: + pci_swiotlb_init(); } void *dma_generic_alloc_coherent(struct device *dev, size_t size, diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index e3c0a66b9e77..7d2829dde20e 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -43,12 +43,12 @@ static struct dma_map_ops swiotlb_dma_ops = { }; /* - * pci_swiotlb_init - initialize swiotlb if necessary + * pci_swiotlb_detect - set swiotlb to 1 if necessary * * This returns non-zero if we are forced to use swiotlb (by the boot * option). */ -int __init pci_swiotlb_init(void) +int __init pci_swiotlb_detect(void) { int use_swiotlb = swiotlb | swiotlb_force; @@ -60,10 +60,13 @@ int __init pci_swiotlb_init(void) if (swiotlb_force) swiotlb = 1; + return use_swiotlb; +} + +void __init pci_swiotlb_init(void) +{ if (swiotlb) { swiotlb_init(0); dma_ops = &swiotlb_dma_ops; } - - return use_swiotlb; } -- cgit v1.2.1 From 2e8c12436f540d3c40137ebf10268803dc972f6a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:39 -0800 Subject: cs5535: move the DIVIL MSR definition into linux/cs5535.h The only thing that uses this is the reboot_fixups code. Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/reboot_fixups_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index 201eab63b05f..fda313ebbb03 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include static void cs5530a_warm_reset(struct pci_dev *dev) { -- cgit v1.2.1 From f060f27007b393bac6e50ee6fc26d8505acf6fe4 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:40 -0800 Subject: cs5535: move VSA2 checks into linux/cs5535.h Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/geode_32.c | 22 ---------------------- arch/x86/kernel/olpc.c | 4 ++-- 2 files changed, 2 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index 9b08e852fd1a..9dad6ca6cd70 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -161,28 +161,6 @@ void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) } EXPORT_SYMBOL_GPL(geode_gpio_setup_event); -int geode_has_vsa2(void) -{ - static int has_vsa2 = -1; - - if (has_vsa2 == -1) { - u16 val; - - /* - * The VSA has virtual registers that we can query for a - * signature. - */ - outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); - outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); - - val = inw(VSA_VRC_DATA); - has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG); - } - - return has_vsa2; -} -EXPORT_SYMBOL_GPL(geode_has_vsa2); - static int __init geode_southbridge_init(void) { if (!is_geode()) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 4006c522adc7..9d1d263f786f 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -212,7 +212,7 @@ static int __init olpc_init(void) unsigned char *romsig; /* The ioremap check is dangerous; limit what we run it on */ - if (!is_geode() || geode_has_vsa2()) + if (!is_geode() || cs5535_has_vsa2()) return 0; spin_lock_init(&ec_lock); @@ -244,7 +244,7 @@ static int __init olpc_init(void) (unsigned char *) &olpc_platform_info.ecver, 1); /* check to see if the VSA exists */ - if (geode_has_vsa2()) + if (cs5535_has_vsa2()) olpc_platform_info.flags |= OLPC_F_VSA; printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", -- cgit v1.2.1 From c95d1e53ed89b75a4d7b68d1cbae4607b1479243 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 14 Dec 2009 18:00:41 -0800 Subject: cs5535: drop the Geode-specific MFGPT/GPIO code With generic modular drivers handling all of this stuff, the geode-specific code can go away. The cs5535-gpio, cs5535-mfgpt, and cs5535-clockevt drivers now handle this. Signed-off-by: Andres Salomon Cc: Jordan Crouse Cc: Ingo Molnar Cc: Thomas Gleixner Cc: john stultz Cc: Chris Ball Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/geode_32.c | 174 ------------------- arch/x86/kernel/mfgpt_32.c | 410 --------------------------------------------- 3 files changed, 585 deletions(-) delete mode 100644 arch/x86/kernel/geode_32.c delete mode 100644 arch/x86/kernel/mfgpt_32.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4f2e66e29ecc..d87f09bc5a52 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c deleted file mode 100644 index 9dad6ca6cd70..000000000000 --- a/arch/x86/kernel/geode_32.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * AMD Geode southbridge support code - * Copyright (C) 2006, Advanced Micro Devices, Inc. - * Copyright (C) 2007, Andres Salomon - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include - -static struct { - char *name; - u32 msr; - int size; - u32 base; -} lbars[] = { - { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, - { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, - { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, - { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } -}; - -static void __init init_lbars(void) -{ - u32 lo, hi; - int i; - - for (i = 0; i < ARRAY_SIZE(lbars); i++) { - rdmsr(lbars[i].msr, lo, hi); - if (hi & 0x01) - lbars[i].base = lo & 0x0000ffff; - - if (lbars[i].base == 0) - printk(KERN_ERR "geode: Couldn't initialize '%s'\n", - lbars[i].name); - } -} - -int geode_get_dev_base(unsigned int dev) -{ - BUG_ON(dev >= ARRAY_SIZE(lbars)); - return lbars[dev].base; -} -EXPORT_SYMBOL_GPL(geode_get_dev_base); - -/* === GPIO API === */ - -void geode_gpio_set(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - - if (!base) - return; - - /* low bank register */ - if (gpio & 0xFFFF) - outl(gpio & 0xFFFF, base + reg); - /* high bank register */ - gpio >>= 16; - if (gpio) - outl(gpio, base + 0x80 + reg); -} -EXPORT_SYMBOL_GPL(geode_gpio_set); - -void geode_gpio_clear(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - - if (!base) - return; - - /* low bank register */ - if (gpio & 0xFFFF) - outl((gpio & 0xFFFF) << 16, base + reg); - /* high bank register */ - gpio &= (0xFFFF << 16); - if (gpio) - outl(gpio, base + 0x80 + reg); -} -EXPORT_SYMBOL_GPL(geode_gpio_clear); - -int geode_gpio_isset(u32 gpio, unsigned int reg) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - u32 val; - - if (!base) - return 0; - - /* low bank register */ - if (gpio & 0xFFFF) { - val = inl(base + reg) & (gpio & 0xFFFF); - if ((gpio & 0xFFFF) == val) - return 1; - } - /* high bank register */ - gpio >>= 16; - if (gpio) { - val = inl(base + 0x80 + reg) & gpio; - if (gpio == val) - return 1; - } - return 0; -} -EXPORT_SYMBOL_GPL(geode_gpio_isset); - -void geode_gpio_set_irq(unsigned int group, unsigned int irq) -{ - u32 lo, hi; - - if (group > 7 || irq > 15) - return; - - rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); - - lo &= ~(0xF << (group * 4)); - lo |= (irq & 0xF) << (group * 4); - - wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); -} -EXPORT_SYMBOL_GPL(geode_gpio_set_irq); - -void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) -{ - u32 base = geode_get_dev_base(GEODE_DEV_GPIO); - u32 offset, shift, val; - - if (gpio >= 24) - offset = GPIO_MAP_W; - else if (gpio >= 16) - offset = GPIO_MAP_Z; - else if (gpio >= 8) - offset = GPIO_MAP_Y; - else - offset = GPIO_MAP_X; - - shift = (gpio % 8) * 4; - - val = inl(base + offset); - - /* Clear whatever was there before */ - val &= ~(0xF << shift); - - /* And set the new value */ - - val |= ((pair & 7) << shift); - - /* Set the PME bit if this is a PME event */ - - if (pme) - val |= (1 << (shift + 3)); - - outl(val, base + offset); -} -EXPORT_SYMBOL_GPL(geode_gpio_setup_event); - -static int __init geode_southbridge_init(void) -{ - if (!is_geode()) - return -ENODEV; - - init_lbars(); - (void) mfgpt_timer_setup(); - return 0; -} - -postcore_initcall(geode_southbridge_init); diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c deleted file mode 100644 index 2a62d843f015..000000000000 --- a/arch/x86/kernel/mfgpt_32.c +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT) - * - * Copyright (C) 2006, Advanced Micro Devices, Inc. - * Copyright (C) 2007, Andres Salomon - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book. - */ - -/* - * We are using the 32.768kHz input clock - it's the only one that has the - * ranges we find desirable. The following table lists the suitable - * divisors and the associated Hz, minimum interval and the maximum interval: - * - * Divisor Hz Min Delta (s) Max Delta (s) - * 1 32768 .00048828125 2.000 - * 2 16384 .0009765625 4.000 - * 4 8192 .001953125 8.000 - * 8 4096 .00390625 16.000 - * 16 2048 .0078125 32.000 - * 32 1024 .015625 64.000 - * 64 512 .03125 128.000 - * 128 256 .0625 256.000 - * 256 128 .125 512.000 - */ - -#include -#include -#include -#include - -#define MFGPT_DEFAULT_IRQ 7 - -static struct mfgpt_timer_t { - unsigned int avail:1; -} mfgpt_timers[MFGPT_MAX_TIMERS]; - -/* Selected from the table above */ - -#define MFGPT_DIVISOR 16 -#define MFGPT_SCALE 4 /* divisor = 2^(scale) */ -#define MFGPT_HZ (32768 / MFGPT_DIVISOR) -#define MFGPT_PERIODIC (MFGPT_HZ / HZ) - -/* Allow for disabling of MFGPTs */ -static int disable; -static int __init mfgpt_disable(char *s) -{ - disable = 1; - return 1; -} -__setup("nomfgpt", mfgpt_disable); - -/* Reset the MFGPT timers. This is required by some broken BIOSes which already - * do the same and leave the system in an unstable state. TinyBIOS 0.98 is - * affected at least (0.99 is OK with MFGPT workaround left to off). - */ -static int __init mfgpt_fix(char *s) -{ - u32 val, dummy; - - /* The following udocumented bit resets the MFGPT timers */ - val = 0xFF; dummy = 0; - wrmsr(MSR_MFGPT_SETUP, val, dummy); - return 1; -} -__setup("mfgptfix", mfgpt_fix); - -/* - * Check whether any MFGPTs are available for the kernel to use. In most - * cases, firmware that uses AMD's VSA code will claim all timers during - * bootup; we certainly don't want to take them if they're already in use. - * In other cases (such as with VSAless OpenFirmware), the system firmware - * leaves timers available for us to use. - */ - - -static int timers = -1; - -static void geode_mfgpt_detect(void) -{ - int i; - u16 val; - - timers = 0; - - if (disable) { - printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n"); - goto done; - } - - if (!geode_get_dev_base(GEODE_DEV_MFGPT)) { - printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n"); - goto done; - } - - for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - val = geode_mfgpt_read(i, MFGPT_REG_SETUP); - if (!(val & MFGPT_SETUP_SETUP)) { - mfgpt_timers[i].avail = 1; - timers++; - } - } - -done: - printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers); -} - -int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) -{ - u32 msr, mask, value, dummy; - int shift = (cmp == MFGPT_CMP1) ? 0 : 8; - - if (timer < 0 || timer >= MFGPT_MAX_TIMERS) - return -EIO; - - /* - * The register maps for these are described in sections 6.17.1.x of - * the AMD Geode CS5536 Companion Device Data Book. - */ - switch (event) { - case MFGPT_EVENT_RESET: - /* - * XXX: According to the docs, we cannot reset timers above - * 6; that is, resets for 7 and 8 will be ignored. Is this - * a problem? -dilinger - */ - msr = MSR_MFGPT_NR; - mask = 1 << (timer + 24); - break; - - case MFGPT_EVENT_NMI: - msr = MSR_MFGPT_NR; - mask = 1 << (timer + shift); - break; - - case MFGPT_EVENT_IRQ: - msr = MSR_MFGPT_IRQ; - mask = 1 << (timer + shift); - break; - - default: - return -EIO; - } - - rdmsr(msr, value, dummy); - - if (enable) - value |= mask; - else - value &= ~mask; - - wrmsr(msr, value, dummy); - return 0; -} -EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); - -int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable) -{ - u32 zsel, lpc, dummy; - int shift; - - if (timer < 0 || timer >= MFGPT_MAX_TIMERS) - return -EIO; - - /* - * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA - * is using the same CMP of the timer's Siamese twin, the IRQ is set to - * 2, and we mustn't use nor change it. - * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the - * IRQ of the 1st. This can only happen if forcing an IRQ, calling this - * with *irq==0 is safe. Currently there _are_ no 2 drivers. - */ - rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); - shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4; - if (((zsel >> shift) & 0xF) == 2) - return -EIO; - - /* Choose IRQ: if none supplied, keep IRQ already set or use default */ - if (!*irq) - *irq = (zsel >> shift) & 0xF; - if (!*irq) - *irq = MFGPT_DEFAULT_IRQ; - - /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */ - if (*irq < 1 || *irq == 2 || *irq > 15) - return -EIO; - rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy); - if (lpc & (1 << *irq)) - return -EIO; - - /* All chosen and checked - go for it */ - if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) - return -EIO; - if (enable) { - zsel = (zsel & ~(0xF << shift)) | (*irq << shift); - wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); - } - - return 0; -} - -static int mfgpt_get(int timer) -{ - mfgpt_timers[timer].avail = 0; - printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer); - return timer; -} - -int geode_mfgpt_alloc_timer(int timer, int domain) -{ - int i; - - if (timers == -1) { - /* timers haven't been detected yet */ - geode_mfgpt_detect(); - } - - if (!timers) - return -1; - - if (timer >= MFGPT_MAX_TIMERS) - return -1; - - if (timer < 0) { - /* Try to find an available timer */ - for (i = 0; i < MFGPT_MAX_TIMERS; i++) { - if (mfgpt_timers[i].avail) - return mfgpt_get(i); - - if (i == 5 && domain == MFGPT_DOMAIN_WORKING) - break; - } - } else { - /* If they requested a specific timer, try to honor that */ - if (mfgpt_timers[timer].avail) - return mfgpt_get(timer); - } - - /* No timers available - too bad */ - return -1; -} -EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); - - -#ifdef CONFIG_GEODE_MFGPT_TIMER - -/* - * The MFPGT timers on the CS5536 provide us with suitable timers to use - * as clock event sources - not as good as a HPET or APIC, but certainly - * better than the PIT. This isn't a general purpose MFGPT driver, but - * a simplified one designed specifically to act as a clock event source. - * For full details about the MFGPT, please consult the CS5536 data sheet. - */ - -#include -#include - -static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; -static u16 mfgpt_event_clock; - -static int irq; -static int __init mfgpt_setup(char *str) -{ - get_option(&str, &irq); - return 1; -} -__setup("mfgpt_irq=", mfgpt_setup); - -static void mfgpt_disable_timer(u16 clock) -{ - /* avoid races by clearing CMP1 and CMP2 unconditionally */ - geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN | - MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2); -} - -static int mfgpt_next_event(unsigned long, struct clock_event_device *); -static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *); - -static struct clock_event_device mfgpt_clockevent = { - .name = "mfgpt-timer", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = mfgpt_set_mode, - .set_next_event = mfgpt_next_event, - .rating = 250, - .cpumask = cpu_all_mask, - .shift = 32 -}; - -static void mfgpt_start_timer(u16 delta) -{ - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta); - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); - - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, - MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2); -} - -static void mfgpt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - mfgpt_disable_timer(mfgpt_event_clock); - - if (mode == CLOCK_EVT_MODE_PERIODIC) - mfgpt_start_timer(MFGPT_PERIODIC); - - mfgpt_tick_mode = mode; -} - -static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt) -{ - mfgpt_start_timer(delta); - return 0; -} - -static irqreturn_t mfgpt_tick(int irq, void *dev_id) -{ - u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP); - - /* See if the interrupt was for us */ - if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1))) - return IRQ_NONE; - - /* Turn off the clock (and clear the event) */ - mfgpt_disable_timer(mfgpt_event_clock); - - if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN) - return IRQ_HANDLED; - - /* Clear the counter */ - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); - - /* Restart the clock in periodic mode */ - - if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) { - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, - MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2); - } - - mfgpt_clockevent.event_handler(&mfgpt_clockevent); - return IRQ_HANDLED; -} - -static struct irqaction mfgptirq = { - .handler = mfgpt_tick, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER, - .name = "mfgpt-timer" -}; - -int __init mfgpt_timer_setup(void) -{ - int timer, ret; - u16 val; - - timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING); - if (timer < 0) { - printk(KERN_ERR - "mfgpt-timer: Could not allocate a MFPGT timer\n"); - return -ENODEV; - } - - mfgpt_event_clock = timer; - - /* Set up the IRQ on the MFGPT side */ - if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) { - printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); - return -EIO; - } - - /* And register it with the kernel */ - ret = setup_irq(irq, &mfgptirq); - - if (ret) { - printk(KERN_ERR - "mfgpt-timer: Unable to set up the interrupt.\n"); - goto err; - } - - /* Set the clock scale and enable the event mode for CMP2 */ - val = MFGPT_SCALE | (3 << 8); - - geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val); - - /* Set up the clock event */ - mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, - mfgpt_clockevent.shift); - mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF, - &mfgpt_clockevent); - mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE, - &mfgpt_clockevent); - - printk(KERN_INFO - "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n", - timer, irq); - clockevents_register_device(&mfgpt_clockevent); - - return 0; - -err: - geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq); - printk(KERN_ERR - "mfgpt-timer: Unable to set up the MFGPT clock source\n"); - return -EIO; -} - -#endif -- cgit v1.2.1 From e7d2860b690d4f3bed6824757c540579638e3d1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= Date: Mon, 14 Dec 2009 18:01:06 -0800 Subject: tree-wide: convert open calls to remove spaces to skip_spaces() lib function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes use of skip_spaces() defined in lib/string.c for removing leading spaces from strings all over the tree. It decreases lib.a code size by 47 bytes and reuses the function tree-wide: text data bss dec hex filename 64688 584 592 65864 10148 (TOTALS-BEFORE) 64641 584 592 65817 10119 (TOTALS-AFTER) Also, while at it, if we see (*str && isspace(*str)), we can be sure to remove the first condition (*str) as the second one (isspace(*str)) also evaluates to 0 whenever *str == 0, making it redundant. In other words, "a char equals zero is never a space". Julia Lawall tried the semantic patch (http://coccinelle.lip6.fr) below, and found occurrences of this pattern on 3 more files: drivers/leds/led-class.c drivers/leds/ledtrig-timer.c drivers/video/output.c @@ expression str; @@ ( // ignore skip_spaces cases while (*str && isspace(*str)) { \(str++;\|++str;\) } | - *str && isspace(*str) ) Signed-off-by: André Goddard Rosa Cc: Julia Lawall Cc: Martin Schwidefsky Cc: Jeff Dike Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Richard Purdie Cc: Neil Brown Cc: Kyle McMartin Cc: Henrique de Moraes Holschuh Cc: David Howells Cc: Cc: Samuel Ortiz Cc: Patrick McHardy Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/if.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 3c1b12d461d1..e006e56f699c 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #define LINE_SIZE 80 @@ -133,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -142,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "type=", 5)) return -EINVAL; - ptr += 5; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr + 5); for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) -- cgit v1.2.1 From 0b962d473af32ec334df271b54ff4973cb2b4c73 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 15 Dec 2009 15:13:07 -0800 Subject: x86, msr/cpuid: Register enough minors for the MSR and CPUID drivers register_chrdev() hardcodes registering 256 minors, presumably to avoid breaking old drivers. However, we need to register enough minors so that we have all possible CPUs. checkpatch warns on this patch, but the patch is correct: NR_CPUS here is a static *upper bound* on the *maximum CPU index* (not *number of CPUs!*) and that is what we want. Reported-and-tested-by: Russ Anderson Cc: Tejun Heo Cc: Alan Cox Cc: Takashi Iwai Cc: Alexander Viro Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpuid.c | 5 +++-- arch/x86/kernel/msr.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7ef24a796992..cb27fd6136c9 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -187,7 +187,8 @@ static int __init cpuid_init(void) int i, err = 0; i = 0; - if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); err = -EBUSY; @@ -216,7 +217,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); out: return err; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 572b07eee3f4..4bd93c9b2b27 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -246,7 +246,7 @@ static int __init msr_init(void) int i, err = 0; i = 0; - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { printk(KERN_ERR "msr: unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; @@ -274,7 +274,7 @@ out_class: msr_device_destroy(i); class_destroy(msr_class); out_chrdev: - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); out: return err; } -- cgit v1.2.1 From 7f38551fc3ff0e17a38d6f3f0f8831380a88f3cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:20 -0800 Subject: ptrace: x86: implement user_single_step_siginfo() Suggested by Roland. Implement user_single_step_siginfo() for x86. Extract this code from send_sigtrap(). Since x86 calls tracehook_report_syscall_exit(step => 0) the new helper is not used yet. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7079ddaf0731..77b60085a810 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1676,21 +1676,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +static void fill_sigtrap_info(struct task_struct *tsk, + struct pt_regs *regs, + int error_code, int si_code, + struct siginfo *info) { - struct siginfo info; - tsk->thread.trap_no = 1; tsk->thread.error_code = error_code; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = si_code; + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = si_code; + info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; +} + +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, + struct siginfo *info) +{ + fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); +} - /* User-mode ip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) +{ + struct siginfo info; + fill_sigtrap_info(tsk, regs, error_code, si_code, &info); /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } -- cgit v1.2.1 From d51965037325e51f6cd68583413243c3573e47b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Dec 2009 16:47:21 -0800 Subject: ptrace: x86: change syscall_trace_leave() to rely on tracehook when stepping Suggested by Roland. Unlike powepc, x86 always calls tracehook_report_syscall_exit(step) with step = 0, and sends the trap by hand. This results in unnecessary SIGTRAP when PTRACE_SINGLESTEP follows the syscall-exit stop. Change syscall_trace_leave() to pass the correct "step" argument to tracehook and remove the send_sigtrap() logic. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77b60085a810..2779321046bd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1767,29 +1767,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) asmregparm void syscall_trace_leave(struct pt_regs *regs) { + bool step; + if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); - /* * If TIF_SYSCALL_EMU is set, we only get here because of * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). * We already reported this syscall instruction in - * syscall_trace_enter(), so don't do any more now. - */ - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) - return; - - /* - * If we are single-stepping, synthesize a trap to follow the - * system call instruction. + * syscall_trace_enter(). */ - if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP)) - send_sigtrap(current, regs, 0, TRAP_BRKPT); + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); } -- cgit v1.2.1 From c2c9f115741453715d6b4da1cd2de65af8c7ad86 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 15 Dec 2009 16:47:56 -0800 Subject: x86: uv: update XPC to handle updated BIOS interface The UV BIOS has moved the location of some of their pointers to the "partition reserved page" from memory into a uv hub MMR. The GRU does not support bcopy operations from MMR space so we need to special case the MMR addresses using VLOAD operations. Additionally, the BIOS call for registering a message queue watchlist has removed the 'blade' value and eliminated the structure that was being passed in. This is also reflected in this patch. Signed-off-by: Robin Holt Cc: Jack Steiner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/bios_uv.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 63a88e1f987d..b0206a211b09 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -101,21 +101,17 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, } int -uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { - union uv_watchlist_u size_blade; u64 watchlist; s64 ret; - size_blade.size = mq_size; - size_blade.blade = blade; - /* * bios returns watchlist number or negative error number. */ ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - size_blade.val, (u64)intr_mmr_offset, + mq_size, (u64)intr_mmr_offset, (u64)&watchlist, 0); if (ret < BIOS_STATUS_SUCCESS) return ret; -- cgit v1.2.1 From a66022c457755b5eef61e30866114679c95e1f54 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 15 Dec 2009 16:48:28 -0800 Subject: iommu-helper: use bitmap library Use bitmap library and kill some unused iommu helper functions. 1. s/iommu_area_free/bitmap_clear/ 2. s/iommu_area_reserve/bitmap_set/ 3. Use bitmap_find_next_zero_area instead of find_next_zero_area This cannot be simple substitution because find_next_zero_area doesn't check the last bit of the limit in bitmap 4. Remove iommu_area_free, iommu_area_reserve, and find_next_zero_area Signed-off-by: Akinobu Mita Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: FUJITA Tomonori Cc: Joerg Roedel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/pci-calgary_64.c | 6 +++--- arch/x86/kernel/pci-gart_64.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index b990b5cc9541..23824fef789c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -1162,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; - iommu_area_free(range->bitmap, address, pages); + bitmap_clear(range->bitmap, address, pages); } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index c563e4c8ff39..2bbde6078143 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include @@ -212,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - iommu_area_reserve(tbl->it_map, index, npages); + bitmap_set(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } @@ -303,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - iommu_area_free(tbl->it_map, entry, npages); + bitmap_clear(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 56c0e730d3fe..34de53b46f87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -126,7 +126,7 @@ static void free_iommu(unsigned long offset, int size) unsigned long flags; spin_lock_irqsave(&iommu_bitmap_lock, flags); - iommu_area_free(iommu_gart_bitmap, offset, size); + bitmap_clear(iommu_gart_bitmap, offset, size); if (offset >= next_bit) next_bit = offset + size; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -792,7 +792,7 @@ int __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. */ - iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", iommu_size >> 20); -- cgit v1.2.1 From 9d260ebc09a0ad6b5c73e17676df42c7bc75ff64 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 16 Dec 2009 15:43:55 +0100 Subject: x86, amd: Get multi-node CPU info from NodeId MSR instead of PCI config space Use NodeId MSR to get NodeId and number of nodes per processor. Signed-off-by: Andreas Herrmann LKML-Reference: <20091216144355.GB28798@alberich.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 53 ++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 38 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 8dc3ea145c97..e485825130d2 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid) /* * Fixup core topology information for AMD multi-node processors. - * Assumption 1: Number of cores in each internal node is the same. - * Assumption 2: Mixed systems with both single-node and dual-node - * processors are not supported. + * Assumption: Number of cores in each internal node is the same. */ #ifdef CONFIG_X86_HT static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) { -#ifdef CONFIG_PCI - u32 t, cpn; - u8 n, n_id; + unsigned long long value; + u32 nodes, cores_per_node; int cpu = smp_processor_id(); + if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) + return; + /* fixup topology information only once for a core */ if (cpu_has(c, X86_FEATURE_AMD_DCM)) return; - /* check for multi-node processor on boot cpu */ - t = read_pci_config(0, 24, 3, 0xe8); - if (!(t & (1 << 29))) + rdmsrl(MSR_FAM10H_NODE_ID, value); + + nodes = ((value >> 3) & 7) + 1; + if (nodes == 1) return; set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - /* cores per node: each internal node has half the number of cores */ - cpn = c->x86_max_cores >> 1; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = value & 7; - /* even-numbered NB_id of this dual-node processor */ - n = c->phys_proc_id << 1; - - /* - * determine internal node id and assign cores fifty-fifty to - * each node of the dual-node processor - */ - t = read_pci_config(0, 24 + n, 3, 0xe8); - n = (t>>30) & 0x3; - if (n == 0) { - if (c->cpu_core_id < cpn) - n_id = 0; - else - n_id = 1; - } else { - if (c->cpu_core_id < cpn) - n_id = 1; - else - n_id = 0; - } - - /* compute entire NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; - - /* fixup core id to be in range from 0 to cpn */ - c->cpu_core_id = c->cpu_core_id % cpn; -#endif + /* fixup core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; } #endif -- cgit v1.2.1 From 6a1e008a0915f502eb026fb995ea3e49d5b017f7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Dec 2009 17:59:03 -0800 Subject: x86: Increase MAX_EARLY_RES; insufficient on 32-bit NUMA Due to recent changes wakeup and mptable, we run out of early reservations on 32-bit NUMA. Thus, adjust the available number. Signed-off-by: Yinghai Lu LKML-Reference: <4B22D754.2020706@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f50447d961c0..05ed7ab2ca48 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -724,7 +724,7 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 20 +#define MAX_EARLY_RES 32 struct early_res { u64 start, end; -- cgit v1.2.1 From a4636818f8e0991f32d9528f39cf4f3d6a7d30a3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 17 Dec 2009 11:43:29 -0600 Subject: cpumask: rename tsk_cpumask to tsk_cpus_allowed Noone uses this wrapper yet, and Ingo asked that it be kept consistent with current task_struct usage. (One user crept in via linux-next: fixed) Signed-off-by: Rusty Russell Cc: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9df9441a9a2..f125e5c551c0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1136,7 +1136,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(oldmask, tsk_cpumask(current)); + cpumask_copy(oldmask, tsk_cpus_allowed(current)); set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); if (smp_processor_id() != pol->cpu) { -- cgit v1.2.1 From 61c1917f47f73c968e92d04d15370b1dc3ec4592 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Dec 2009 05:40:33 +0100 Subject: perf events, x86/stacktrace: Make stack walking optional The current print_context_stack helper that does the stack walking job is good for usual stacktraces as it walks through all the stack and reports even addresses that look unreliable, which is nice when we don't have frame pointers for example. But we have users like perf that only require reliable stacktraces, and those may want a more adapted stack walker, so lets make this function a callback in stacktrace_ops that users can tune for their needs. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1261024834-5336-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 1 + arch/x86/kernel/dumpstack.c | 9 +++++---- arch/x86/kernel/dumpstack.h | 6 ------ arch/x86/kernel/dumpstack_32.c | 2 +- arch/x86/kernel/dumpstack_64.c | 4 ++-- arch/x86/kernel/stacktrace.c | 18 ++++++++++-------- 6 files changed, 19 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 45506d5dd8df..d3802ee5a416 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2336,6 +2336,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, + .walk_stack = print_context_stack, }; #include "../dumpstack.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 0a0aa1cec8f1..8aaa119b7cad 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -141,10 +141,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, }; void diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 81086c227ab7..4fd1420faffa 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,12 +14,6 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif -extern unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); - extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0ed4c7abb62..ae775ca47b25 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -58,7 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index b13af53883aa..0ad9597073f5 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -188,8 +188,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, id) < 0) break; - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, + data, estack_end, &graph); ops->stack(data, ""); /* * We link to the next stack via the diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c3eb207181fe..922eefbb3f6c 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, + .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address_nosched, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, + .walk_stack = print_context_stack, }; /* -- cgit v1.2.1 From 06d65bda75341485d32f33da474b0664819ad497 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Dec 2009 05:40:34 +0100 Subject: perf events, x86/stacktrace: Fix performance/softlockup by providing a special frame pointer-only stack walker It's just wasteful for stacktrace users like perf to walk through every entries on the stack whereas these only accept reliable ones, ie: that the frame pointer validates. Since perf requires pure reliable stacktraces, it needs a stack walker based on frame pointers-only to optimize the stacktrace processing. This might solve some near-lockup scenarios that can be triggered by call-graph tracing timer events. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1261024834-5336-2-git-send-regression-fweisbec@gmail.com> [ v2: fix for modular builds and small detail tidyup ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/dumpstack.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d3802ee5a416..c223b7e895d9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2336,7 +2336,7 @@ static const struct stacktrace_ops backtrace_ops = { .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, - .walk_stack = print_context_stack, + .walk_stack = print_context_stack_bp, }; #include "../dumpstack.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 8aaa119b7cad..c56bc2873030 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -109,6 +109,30 @@ print_context_stack(struct thread_info *tinfo, } return bp; } +EXPORT_SYMBOL_GPL(print_context_stack); + +unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long *ret_addr = &frame->return_address; + + while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + unsigned long addr = *ret_addr; + + if (__kernel_text_address(addr)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + } + return (unsigned long)frame; +} +EXPORT_SYMBOL_GPL(print_context_stack_bp); static void @@ -143,8 +167,8 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) static const struct stacktrace_ops print_trace_ops = { .warning = print_trace_warning, .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .stack = print_trace_stack, + .address = print_trace_address, .walk_stack = print_context_stack, }; -- cgit v1.2.1 From 04a1e62c2cec820501f93526ad1e46073b802dc4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 17 Dec 2009 07:04:56 -0800 Subject: x86/ptrace: make genregs[32]_get/set more robust The loop condition is fragile: we compare an unsigned value to zero, and then decrement it by something larger than one in the loop. All the callers should be passing in appropriately aligned buffer lengths, but it's better to just not rely on it, and have some appropriate defensive loop limits. Acked-by: Roland McGrath Signed-off-by: Linus Torvalds --- arch/x86/kernel/ptrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2779321046bd..017d937639fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -509,14 +509,14 @@ static int genregs_get(struct task_struct *target, { if (kbuf) { unsigned long *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { *k++ = getreg(target, pos); count -= sizeof(*k); pos += sizeof(*k); } } else { unsigned long __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { if (__put_user(getreg(target, pos), u++)) return -EFAULT; count -= sizeof(*u); @@ -535,14 +535,14 @@ static int genregs_set(struct task_struct *target, int ret = 0; if (kbuf) { const unsigned long *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) @@ -1458,14 +1458,14 @@ static int genregs32_get(struct task_struct *target, { if (kbuf) { compat_ulong_t *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { getreg32(target, pos, k++); count -= sizeof(*k); pos += sizeof(*k); } } else { compat_ulong_t __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { compat_ulong_t word; getreg32(target, pos, &word); if (__put_user(word, u++)) @@ -1486,14 +1486,14 @@ static int genregs32_set(struct task_struct *target, int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) -- cgit v1.2.1 From 6c56ccecf05fafe100ab4ea94f6fccbf5ff00db7 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 17 Dec 2009 12:27:02 -0800 Subject: x86: Reenable TSC sync check at boot, even with NONSTOP_TSC Commit 83ce4009 did the following change If the TSC is constant and non-stop, also set it reliable. But, there seems to be few systems that will end up with TSC warp across sockets, depending on how the cpus come out of reset. Skipping TSC sync test on such systems may result in time inconsistency later. So, reenable TSC sync test even on constant and non-stop TSC systems. Set, sched_clock_stable to 1 by default and reset it in mark_tsc_unstable, if TSC sync fails. This change still gives perf benefit mentioned in 83ce4009 for systems where TSC is reliable. Signed-off-by: Venkatesh Pallipadi Acked-by: Suresh Siddha LKML-Reference: <20091217202702.GA18015@linux-os.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel.c | 1 - arch/x86/kernel/tsc.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9c31e8b09d2c..879666f4d871 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); sched_clock_stable = 1; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cd982f48e23e..597683aa5ba0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + sched_clock_stable = 0; printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) -- cgit v1.2.1 From 18374d89e5fe96772102f44f535efb1198d9be08 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 17 Dec 2009 18:29:46 -0800 Subject: x86, irq: Allow 0xff for /proc/irq/[n]/smp_affinity on an 8-cpu system John Blackwood reported: > on an older Dell PowerEdge 6650 system with 8 cpus (4 are hyper-threaded), > and 32 bit (x86) kernel, once you change the irq smp_affinity of an irq > to be less than all cpus in the system, you can never change really the > irq smp_affinity back to be all cpus in the system (0xff) again, > even though no error status is returned on the "/bin/echo ff > > /proc/irq/[n]/smp_affinity" operation. > > This is due to that fact that BAD_APICID has the same value as > all cpus (0xff) on 32bit kernels, and thus the value returned from > set_desc_affinity() via the cpu_mask_to_apicid_and() function is treated > as a failure in set_ioapic_affinity_irq_desc(), and no affinity changes > are made. set_desc_affinity() is already checking if the incoming cpu mask intersects with the cpu online mask or not. So there is no need for the apic op cpu_mask_to_apicid_and() to check again and return BAD_APICID. Remove the BAD_APICID return value from cpu_mask_to_apicid_and() and also fix set_desc_affinity() to return -1 instead of using BAD_APICID to represent error conditions (as cpu_mask_to_apicid_and() can return logical or physical apicid values and BAD_APICID is really to represent bad physical apic id). Reported-by: John Blackwood Root-caused-by: John Blackwood Signed-off-by: Suresh Siddha LKML-Reference: <1261103386.2535.409.camel@sbs-t61> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic_flat_64.c | 5 +---- arch/x86/kernel/apic/bigsmp_32.c | 5 +---- arch/x86/kernel/apic/io_apic.c | 32 ++++++++++++++------------------ arch/x86/kernel/apic/x2apic_cluster.c | 5 +---- arch/x86/kernel/apic/x2apic_phys.c | 5 +---- arch/x86/kernel/apic/x2apic_uv_x.c | 5 +---- arch/x86/kernel/uv_irq.c | 3 +-- 7 files changed, 20 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index d0c99abc26c3..eacbd2b31d27 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -306,10 +306,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } struct apic apic_physflat = { diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38dcecfa5818..cb804c5091b9 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -131,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return bigsmp_cpu_to_logical_apicid(cpu); - - return BAD_APICID; + return bigsmp_cpu_to_logical_apicid(cpu); } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5d498fbee4b..98ced709e829 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2276,26 +2276,28 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq /* * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and * leaves desc->affinity untouched. */ unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, + unsigned int *dest_id) { struct irq_cfg *cfg; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; + return -1; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; + return -1; cpumask_copy(desc->affinity, mask); - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + return 0; } static int @@ -2311,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { + ret = set_desc_affinity(desc, mask, &dest); + if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); - ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -3351,8 +3352,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3384,8 +3384,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) if (get_irte(irq, &irte)) return -1; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3567,8 +3566,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3623,8 +3621,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3730,8 +3727,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) struct irq_cfg *cfg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index a5371ec36776..cf69c59f4910 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_logical_apicid, cpu); } static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a8989aadc99a..8972f38c5ced 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_phys_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b684bb303cbf..d56b0efb2057 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -225,10 +225,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_get_apic_id(unsigned long x) diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 61d805df4c91..ece73d8e3240 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -215,8 +215,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) unsigned long mmr_offset; unsigned mmr_pnode; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; mmr_value = 0; -- cgit v1.2.1 From 0f764806438d5576ac58898332e5dcf30bb8a679 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 21 Dec 2009 15:51:23 +0100 Subject: x86/amd-iommu: Fix initialization failure panic The assumption that acpi_table_parse passes the return value of the hanlder function to the caller proved wrong recently. The return value of the handler function is totally ignored. This makes the initialization code for AMD IOMMU buggy in a way that could cause a kernel panic on initialization. This patch fixes the issue in the AMD IOMMU driver. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1dca9c34eaeb..fb490ce7dd55 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,11 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? */ bool amd_iommu_np_cache __read_mostly; +/* + * Set to true if ACPI table parsing and hardware intialization went properly + */ +static bool amd_iommu_initialized; + /* * List of protection domains - used during resume */ @@ -929,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); + amd_iommu_initialized = true; + return 0; } @@ -1263,6 +1270,9 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; + if (!amd_iommu_initialized) + goto free; + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; -- cgit v1.2.1 From 1d9cb470a755409ce97c3376174b1e234bd20371 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:14 -0700 Subject: ACPI: processor: introduce arch_has_acpi_pdc arch dependent helper function that tells us if we should attempt to evaluate _PDC on this machine or not. The x86 implementation assumes that the CPUs in the machine must be homogeneous, and that you cannot mix CPUs of different vendors. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d85d1b2432ba..bcb6efe08c5d 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,9 +79,7 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL || - c->x86_vendor == X86_VENDOR_CENTAUR) - init_intel_pdc(pr, c); + init_intel_pdc(pr, c); return; } -- cgit v1.2.1 From 407cd87c54e76c266245e8faef8dd4a84b7254fe Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:19 -0700 Subject: ACPI: processor: unify arch_acpi_processor_init_pdc The x86 and ia64 implementations of arch_acpi_processor_init_pdc() are almost exactly the same. The only difference is in what bits they set in obj_list buffer. Combine the boilerplate memory management code, and leave the arch-specific bit twiddling in separate implementations. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index bcb6efe08c5d..967860b43f2a 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -14,31 +14,7 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) { - struct acpi_object_list *obj_list; - union acpi_object *obj; - u32 *buf; - - /* allocate and initialize pdc. It will be used later. */ - obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL); - if (!obj_list) { - printk(KERN_ERR "Memory allocation error\n"); - return; - } - - obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL); - if (!obj) { - printk(KERN_ERR "Memory allocation error\n"); - kfree(obj_list); - return; - } - - buf = kmalloc(12, GFP_KERNEL); - if (!buf) { - printk(KERN_ERR "Memory allocation error\n"); - kfree(obj); - kfree(obj_list); - return; - } + u32 *buf = (u32 *)pr->pdc->pointer->buffer.pointer; buf[0] = ACPI_PDC_REVISION_ID; buf[1] = 1; @@ -62,13 +38,6 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_MWAIT)) buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); - obj->type = ACPI_TYPE_BUFFER; - obj->buffer.length = 12; - obj->buffer.pointer = (u8 *) buf; - obj_list->count = 1; - obj_list->pointer = obj; - pr->pdc = obj_list; - return; } @@ -78,7 +47,6 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) { struct cpuinfo_x86 *c = &cpu_data(pr->id); - pr->pdc = NULL; init_intel_pdc(pr, c); return; -- cgit v1.2.1 From 08ea48a326d8030ef5b7fb02292faf5a53c95e0a Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:24 -0700 Subject: ACPI: processor: factor out common _PDC settings Both x86 and ia64 initialize _PDC with mostly common bit settings. Factor out the common settings and leave the arch-specific ones alone. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 967860b43f2a..d722ca8cb4c7 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -16,16 +16,8 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) { u32 *buf = (u32 *)pr->pdc->pointer->buffer.pointer; - buf[0] = ACPI_PDC_REVISION_ID; - buf[1] = 1; - buf[2] = ACPI_PDC_C_CAPABILITY_SMP; + buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; - /* - * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so - * that OSPM is capable of native ACPI throttling software - * coordination using BIOS supplied _TSD info. - */ - buf[2] |= ACPI_PDC_SMP_T_SWCOORD; if (cpu_has(c, X86_FEATURE_EST)) buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; -- cgit v1.2.1 From 6c5807d7bc7d051fce00863ffb98d36325501eb2 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:29 -0700 Subject: ACPI: processor: finish unifying arch_acpi_processor_init_pdc() The only thing arch-specific about calling _PDC is what bits get set in the input obj_list buffer. There's no need for several levels of indirection to twiddle those bits. Additionally, since we're just messing around with a buffer, we can simplify the interface; no need to pass around the entire struct acpi_processor * just to get at the buffer. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/processor.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d722ca8cb4c7..0f57307f8224 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -12,40 +12,6 @@ #include #include -static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) -{ - u32 *buf = (u32 *)pr->pdc->pointer->buffer.pointer; - - buf[2] |= ACPI_PDC_C_CAPABILITY_SMP; - - if (cpu_has(c, X86_FEATURE_EST)) - buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP; - - if (cpu_has(c, X86_FEATURE_ACPI)) - buf[2] |= ACPI_PDC_T_FFH; - - /* - * If mwait/monitor is unsupported, C2/C3_FFH will be disabled - */ - if (!cpu_has(c, X86_FEATURE_MWAIT)) - buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); - - return; -} - - -/* Initialize _PDC data based on the CPU vendor */ -void arch_acpi_processor_init_pdc(struct acpi_processor *pr) -{ - struct cpuinfo_x86 *c = &cpu_data(pr->id); - - init_intel_pdc(pr, c); - - return; -} - -EXPORT_SYMBOL(arch_acpi_processor_init_pdc); - void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) { if (pr->pdc) { -- cgit v1.2.1 From 47817254b8637b56730aec26eed2c337d3938bb5 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Sun, 20 Dec 2009 12:19:34 -0700 Subject: ACPI: processor: unify arch_acpi_processor_cleanup_pdc The x86 and ia64 implementations of the function in $subject are exactly the same. Also, since the arch-specific implementations of setting _PDC have been completely hollowed out, remove the empty shells. Cc: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Alex Chiang Signed-off-by: Len Brown --- arch/x86/kernel/acpi/Makefile | 2 +- arch/x86/kernel/acpi/processor.c | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) delete mode 100644 arch/x86/kernel/acpi/processor.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index fd5ca97a2ad5..6f35260bb3ef 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) -obj-y += cstate.o processor.o +obj-y += cstate.o endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c deleted file mode 100644 index 0f57307f8224..000000000000 --- a/arch/x86/kernel/acpi/processor.c +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) 2005 Intel Corporation - * Venkatesh Pallipadi - * - Added _PDC for platforms with Intel CPUs - */ - -#include -#include -#include -#include - -#include -#include - -void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr) -{ - if (pr->pdc) { - kfree(pr->pdc->pointer->buffer.pointer); - kfree(pr->pdc->pointer); - kfree(pr->pdc); - pr->pdc = NULL; - } -} - -EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc); -- cgit v1.2.1 From 4a28395d72a956f2dad24e343d06bc08c9afb89a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 21 Dec 2009 16:19:58 -0800 Subject: arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c: avoid cross-CPU interrupts by using smp_call_function_any() Presently acpi-cpufreq will perform the MSR read on the first CPU in the mask. That's inefficient if that CPU differs from the current CPU. Because we have to perform a cross-CPU call, but we could have run the rdmsr on the current CPU. So switch to using the new smp_call_function_any(), which will perform the call on the current CPU if that CPU is present in the mask (it is). Cc: "Zhang, Yanmin" Cc: Dave Jones Cc: Ingo Molnar Cc: Jaswinder Singh Rajput Cc: Len Brown Cc: Mike Galbraith Cc: Rusty Russell Cc: Thomas Gleixner Cc: Venkatesh Pallipadi Cc: Zhao Yakui Signed-off-by: Andrew Morton Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index f28decf8dde3..1b1920fa7c80 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd) static void drv_read(struct drv_cmd *cmd) { + int err; cmd->val = 0; - smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); + err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); + WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ } static void drv_write(struct drv_cmd *cmd) -- cgit v1.2.1 From 2f99f5c8f05e02f3df1bb4d93b6704e6f5972872 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Dec 2009 15:04:53 -0800 Subject: Revert "x86, ucode-amd: Ensure ucode update on suspend/resume after CPU off/online cycle" This reverts commit 9f15226e75583547aaf542c6be4bdac1060dd425. It's just wrong, and broke resume for Rafael even on a non-AMD CPU. As Rafael says: "... it causes microcode_init_cpu() to be called during resume even for CPUs for which there's no microcode to apply. That, in turn, results in executing request_firmware() (on Intel CPUs at least) which doesn't work at this stage of resume (we have device interrupts disabled, I/O devices are still suspended and so on). If I'm not mistaken, the "if (uci->valid)" logic means "if that CPU is known to us" , so before commit 9f15226e755 microcode_resume_cpu() was called for all CPUs already in the system during suspend, which was the right thing to do. The commit changed it so that the CPUs without microcode to apply are now treated as "unknown", which is not quite right. The problem this commit attempted to solve has to be handled differently." Bisected-and -requested-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 844c02c65fcb..0c8632433090 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -394,7 +394,7 @@ static enum ucode_state microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; enum ucode_state ustate; - if (uci->valid && uci->mc) + if (uci->valid) ustate = microcode_resume_cpu(cpu); else ustate = microcode_init_cpu(cpu); -- cgit v1.2.1 From d015a092989d673df44a5ad6866dc5d5006b7a2a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Mon, 28 Dec 2009 10:26:59 +0200 Subject: x86: Use KERN_DEFAULT log-level in __show_regs() Andrew Morton reported a strange looking kmemcheck warning: WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88004fba6c20) 0000000000000000310000000000000000000000000000002413000000c9ffff u u u u u u u u u u u u u u u u i i i i i i i i u u u u u u u u [] kmemleak_scan+0x25a/0x540 [] kmemleak_scan_thread+0x5b/0xe0 [] kthread+0x9e/0xb0 [] kernel_thread_helper+0x4/0x10 [] 0xffffffffffffffff The above printout is missing register dump completely. The problem here is that the output comes from syslog which doesn't show KERN_INFO log-level messages. We didn't see this before because both of us were testing on 32-bit kernels which use the _default_ log-level. Fix that up by explicitly using KERN_DEFAULT log-level for __show_regs() printks. Signed-off-by: Pekka Enberg Cc: Vegard Nossum Cc: Andrew Morton Cc: Arjan van de Ven Cc: Linus Torvalds LKML-Reference: <1261988819.4641.2.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/process_32.c | 14 +++++++------- arch/x86/kernel/process_64.c | 24 ++++++++++++------------ 3 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 98c2cdeb599e..c6ee241c8a98 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -103,8 +103,8 @@ void show_regs_common(void) if (!product) product = ""; - printk("\n"); - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9c517b5858f0..37ad1e046aae 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all) show_regs_common(); - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all) cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52fbd0c60198..f9e033150cdf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int ds, cs, es; show_regs_common(); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all) cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) -- cgit v1.2.1 From 39d30770992895d55789de64bad2349510af68d0 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 28 Dec 2009 13:28:25 -0800 Subject: x86: SGI UV: Fix writes to led registers on remote uv hubs The wrong address was being used to write the SCIR led regs on remote hubs. Also, there was an inconsistency between how BIOS and the kernel indexed these regs. Standardize on using the lower 6 bits of the APIC ID as the index. This patch fixes the problem of writing to an errant address to a cpu # >= 64. Signed-off-by: Mike Travis Reviewed-by: Jack Steiner Cc: Robin Holt Cc: Linus Torvalds Cc: stable@kernel.org LKML-Reference: <4B3922F9.3060905@sgi.com> [ v2: fix a number of annoying checkpatch artifacts and whitespace noise ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb2057..5f92494dab61 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -629,8 +629,10 @@ void __init uv_system_init(void) uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +653,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ -- cgit v1.2.1 From 9a7262a0563da6b91019156abf487bcdf1a41526 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 28 Dec 2009 13:28:25 -0800 Subject: x86_64 SGI UV: Fix writes to led registers on remote uv hubs. The wrong address was being used to write the SCIR led regs on remote hubs. Also, there was an inconsistency between how BIOS and the kernel indexed these regs. Standardize on using the lower 6 bits of the APIC ID as the index. This patch fixes the problem of writing to an errant address to a cpu # >= 64. Signed-off-by: Mike Travis Reviewed-by: Jack Steiner Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/x2apic_uv_x.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d56b0efb2057..5f92494dab61 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -629,8 +629,10 @@ void __init uv_system_init(void) uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -651,15 +653,13 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); + printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", + cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ -- cgit v1.2.1 From d7f0eea9e431e1b8b0742a74db1a9490730b2a25 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 30 Dec 2009 15:36:42 +0800 Subject: ACPI: introduce kernel parameter acpi_sleep=sci_force_enable Introduce kernel parameter acpi_sleep=sci_force_enable some laptop requires SCI_EN being set directly on resume, or else they hung somewhere in the resume code path. We already have a blacklist for these laptops but we still need this option, especially when debugging some suspend/resume problems, in case there are systems that need this workaround and are not yet in the blacklist. Signed-off-by: Zhang Rui Acked-by: Rafael J. Wysocki Signed-off-by: Len Brown --- arch/x86/kernel/acpi/sleep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f9961034e557 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "sci_force_enable", 16) == 0) + acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); -- cgit v1.2.1 From 48b5ba9cc98d676712da29d9931f1c88e5185ff2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Dec 2009 05:53:02 +0100 Subject: perf: Pass appropriate frame pointer to dump_trace() Pass the frame pointer from the regs of the interrupted path to dump_trace() while processing the stack trace. Currently, dump_trace() takes the current bp and starts the callchain from dump_trace() itself. This is wasteful because we need to walk through the entire NMI/DEBUG stack before retrieving the interrupted point. We can fix that by just using the frame pointer from the captured regs. It points exactly where we want to start. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1262235183-5320-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c223b7e895d9..d616c06e99b4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2347,7 +2347,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_KERNEL); callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } /* -- cgit v1.2.1 From 9dad0fd5a73d4048dff18069733c0b515f68df74 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 22 Dec 2009 15:40:39 -0800 Subject: x86: Fix size for ex trampoline with 32bit fix for error that is introduced by | x86: Use find_e820() instead of hard coded trampoline address it should end with PAGE_SIZE + PAGE_SIZE Signed-off-by: Yinghai Lu LKML-Reference: <1261525263-13763-2-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 05ed7ab2ca48..a1a7876cadcb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -733,13 +733,13 @@ struct early_res { }; static struct early_res early_res[MAX_EARLY_RES] __initdata = { { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) /* * But first pinch a few for the stack/trampoline stuff * FIXME: Don't need the extra page at 4K, but need to fix * trampoline before removing it. (see the GDT stuff) */ - { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 }, + { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, #endif {} -- cgit v1.2.1 From 7f41c2e1523f628cc248e34192162aec5728bed7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 6 Jan 2010 10:56:31 -0800 Subject: x86, irq: Check move_in_progress before freeing the vector mapping With the recent irq migration fixes (post 2.6.32), Gary Hade has noticed "No IRQ handler for vector" messages during the 2.6.33-rc1 kernel boot on IBM AMD platforms and root caused the issue to this commit: > commit 23359a88e7eca3c4f402562b102f23014db3c2aa > Author: Suresh Siddha > Date: Mon Oct 26 14:24:33 2009 -0800 > > x86: Remove move_cleanup_count from irq_cfg As part of this patch, we have removed the move_cleanup_count check in smp_irq_move_cleanup_interrupt(). With this change, we can run into a situation where an irq cleanup interrupt on a cpu can cleanup the vector mappings associated with multiple irqs, of which one of the irq's migration might be still in progress. As such when that irq hits the old cpu, we get the "No IRQ handler" messages. Fix this by checking for the irq_cfg's move_in_progress and if the move is still in progress delay the vector cleanup to another irq cleanup interrupt request (which will happen when the irq starts arriving at the new cpu destination). Reported-and-tested-by: Gary Hade Signed-off-by: Suresh Siddha LKML-Reference: <1262804191.2732.7.camel@sbs-t61.sc.intel.com> Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a55..53243ca7816d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2434,6 +2434,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) cfg = irq_cfg(irq); raw_spin_lock(&desc->lock); + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) goto unlock; -- cgit v1.2.1 From 8558e3943df1c51c3377cb4e8a52ea484d6f357d Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Jan 2010 16:11:06 -0500 Subject: x86, ACPI: delete acpi_boot_table_init() return value cleanup only. setup_arch(), doesn't care care if ACPI initialization succeeded or failed, so delete acpi_boot_table_init()'s return value. Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fb1035cd9a6a..036d28adf59d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1529,16 +1529,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * if acpi_blacklisted() acpi_disabled = 1; * acpi_irq_model=... * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure */ -int __init acpi_boot_table_init(void) +void __init acpi_boot_table_init(void) { - int error; - dmi_check_system(acpi_dmi_table); /* @@ -1546,15 +1540,14 @@ int __init acpi_boot_table_init(void) * One exception: acpi=ht continues far enough to enumerate LAPICs */ if (acpi_disabled && !acpi_ht) - return 1; + return; /* * Initialize the ACPI boot-time table parser. */ - error = acpi_table_init(); - if (error) { + if (acpi_table_init()) { disable_acpi(); - return error; + return; } acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1562,18 +1555,15 @@ int __init acpi_boot_table_init(void) /* * blacklist may disable ACPI entirely */ - error = acpi_blacklisted(); - if (error) { + if (acpi_blacklisted()) { if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return error; + return; } } - - return 0; } int __init early_acpi_boot_init(void) -- cgit v1.2.1 From 066000dd856709b6980123eb39b957fe26993f7b Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Mon, 11 Jan 2010 15:51:04 -0800 Subject: Revert "x86, apic: Use logical flat on intel with <= 8 logical cpus" Revert commit 2fbd07a5f5d1295fa9b0c0564ec27da7c276a75a, as this commit breaks an IBM platform with quad-core Xeon cpu's. According to Suresh, this might be an IBM platform issue, as on other Intel platforms with <= 8 logical cpu's, logical flat mode works fine irespective of physical apic id values (inline with the xapic architecture). Revert this for now because of the IBM platform breakage. Another version will be re-submitted after the complete analysis. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Suresh Siddha Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic.c | 26 ++++++++++++++++++-------- arch/x86/kernel/apic/probe_64.c | 15 ++++----------- 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index aa57c079c98f..e80f291472a4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -62,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * On AMD, this determines the messaging protocol we can use: if all APIC IDs + * This determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -1898,14 +1898,24 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - def_to_bigsmp = 1; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: def_to_bigsmp = 1; + } } #endif diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index c4cbd3080c1c..65edc180fc82 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,23 +64,16 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - apic = &apic_physflat; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - apic = &apic_physflat; - } + if (max_physical_apicid >= 8) + apic = &apic_physflat; + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); - if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; -- cgit v1.2.1 From c2c5d45d46c8c0fd34291dec958670ad4816796f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Dec 2009 03:52:25 +0100 Subject: perf: Stop stack frame walking off kernel addresses boundaries While processing kernel perf callchains, an bad entry can be considered as a valid stack pointer but not as a kernel address. In this case, we hang in an endless loop. This can happen in an x86-32 kernel after processing the last entry in a kernel stacktrace. Just stop the stack frame walking after we encounter an invalid kernel address. This fixes a hard lockup in x86-32. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1262227945-27014-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c56bc2873030..6d817554780a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { unsigned long addr = *ret_addr; - if (__kernel_text_address(addr)) { - ops->address(data, addr, 1); - frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); - } + if (!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); } + return (unsigned long)frame; } EXPORT_SYMBOL_GPL(print_context_stack_bp); -- cgit v1.2.1 From fcfbb2b5facd65efa7284cc315225bfe3d1856c2 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 8 Jan 2010 12:13:54 -0800 Subject: x86: SGI UV: Fix mapping of MMIO registers This fixes the problem of the initialization code not correctly mapping the entire MMIO space on a UV system. A side effect is the map_high() interface needed to be changed to accommodate different address and size shifts. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: Cc: Jack Steiner Cc: Linus Torvalds LKML-Reference: <4B479202.7080705@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 5f92494dab61..b8bb869a6618 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -374,13 +374,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, - int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; - paddr = base << shift; - bytes = (1UL << shift) * (max_pnode + 1); + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -396,7 +396,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) { - map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); gru_start_paddr = ((u64)gru.s.base << shift); gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); @@ -410,7 +410,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -420,7 +420,8 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); + map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, + max_pnode, map_uc); } static __init void map_low_mmrs(void) -- cgit v1.2.1 From 42590a75019a50012f25a962246498dead428433 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 4 Jan 2010 16:16:23 +0900 Subject: x86/agp: Fix agp_amd64_init and agp_amd64_cleanup This fixes the regression introduced by the commit f405d2c02395a74d3883bd03ded36457aa3697ad. The above commit fixes the following issue: http://marc.info/?l=linux-kernel&m=126192729110083&w=2 However, it doesn't work properly when you remove and insert the agp_amd64 module again. agp_amd64_init() and agp_amd64_cleanup should be called only when gart_iommu is not called earlier (that is, the GART IOMMU is not enabled). We need to use 'gart_iommu_aperture' to see if GART IOMMU is enabled or not. Signed-off-by: FUJITA Tomonori Cc: mitov@issp.bas.bg Cc: davej@redhat.com LKML-Reference: <20100104161603L.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3704997e8b25..f147a95fd84a 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,6 +31,7 @@ #include int gart_iommu_aperture; +EXPORT_SYMBOL_GPL(gart_iommu_aperture); int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; -- cgit v1.2.1 From 864a0922dd128392467611d9857e5138c6a91999 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 13 Jan 2010 10:16:07 +0000 Subject: x86: kernel_thread() -- initialize SS to a known state Before the kernel_thread was converted into "C" we had pt_regs::ss set to __KERNEL_DS (by SAVE_ALL asm macro). Though I must admit I didn't find any *explicit* load of %ss from this structure the better to be on a safe side and set it to a known value. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ian Campbell Cc: Christian Kujau Cc: Jeremy Fitzhardinge Cc: Brian Gerst LKML-Reference: <1263377768-19600-1-git-send-email-ian.campbell@citrix.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c6ee241c8a98..02c3ee013ccd 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -288,6 +288,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; #endif regs.orig_ax = -1; -- cgit v1.2.1 From 557a701c16553b0b691dbb64ef30361115a80f64 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Mon, 14 Dec 2009 11:44:15 +0100 Subject: [CPUFREQ] Fix use after free of struct powernow_k8_data Easy fix for a regression introduced in 2.6.31. On managed CPUs the cpufreq.c core will call driver->exit(cpu) on the managed cpus and powernow_k8 will free the core's data. Later driver->get(cpu) function might get called trying to read out the current freq of a managed cpu and the NULL pointer check does not work on the freed object -> better set it to NULL. ->get() is unsigned and must return 0 as invalid frequency. Reference: http://bugzilla.kernel.org/show_bug.cgi?id=14391 Signed-off-by: Thomas Renninger Tested-by: Michal Schmidt CC: stable@kernel.org Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index a9df9441a9a2..2da4fa3bf6e9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu) int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) -- cgit v1.2.1 From 7a1110e861b2666ac09f5708d6fbe71d18ce64bb Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 12 Jan 2010 15:09:04 -0600 Subject: x86, uv: Add function retrieving node controller revision number Add function for determining the revision id of the SGI UV node controller chip (HUB). This function is needed in a subsequent patch. Signed-off-by: Jack Steiner LKML-Reference: <20100112210904.GA24546@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b8bb869a6618..0e48de9ff864 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -36,6 +36,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); static inline bool is_GRU_range(u64 start, u64 end) { @@ -55,6 +57,10 @@ static int early_get_nodeid(void) mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); node_id.v = *mmr; early_iounmap(mmr, sizeof(*mmr)); + + /* Currently, all blades have same revision number */ + uv_min_hub_revision_id = node_id.s.revision; + return node_id.s.node_id; } -- cgit v1.2.1 From 1d2c867c941d635e53e8ad7bf37d060bb5b25ec5 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 15 Jan 2010 12:09:09 -0600 Subject: x86, uv: Ensure hub revision set for all ACPI modes. Ensure that UV hub revision is set for all ACPI modes. Signed-off-by: Russ Anderson LKML-Reference: <20100115180908.GB7757@sgi.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 0e48de9ff864..21db3cbea7dc 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -66,7 +66,10 @@ static int early_get_nodeid(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { + int nodeid; + if (!strcmp(oem_id, "SGI")) { + nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; @@ -74,7 +77,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (UV_APIC_PNODE_SHIFT - 1); uv_system_type = UV_NON_UNIQUE_APIC; return 1; } -- cgit v1.2.1 From dfea91d5a7c795fd6f4e1a97489a98e4e767463e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 18 Jan 2010 12:10:48 -0800 Subject: x86, apic: use physical mode for IBM summit platforms Chris McDermott from IBM confirmed that hurricane chipset in IBM summit platforms doesn't support logical flat mode. Irrespective of the other things like apic_id's, total number of logical cpu's, Linux kernel should default to physical mode for this system. The 32-bit kernel does so using the OEM checks for the IBM summit platform. Add a similar OEM platform check for the 64bit kernel too. Otherwise the linux kernel boot can hang on this platform under certain bios/platform settings. Signed-off-by: Suresh Siddha Tested-by: Ananth N Mavinakayanahalli Cc: Chris McDermott Cc: Yinghai Lu Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic_flat_64.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index eacbd2b31d27..e3c3d820c325 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } #endif return 0; -- cgit v1.2.1 From bb668da6d6f2bec8a63838c098d9515eccb22cc4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 18 Jan 2010 12:10:49 -0800 Subject: x86, apic: use logical flat for systems with <= 8 logical cpus We can use logical flat mode if there are <= 8 logical cpu's (irrespective of physical apic id values). This will enable simplified and efficient IPI and device interrupt routing on such platforms. This has been tested to work on both Intel and AMD platforms. Exceptions like IBM summit platform which can't use logical flat mode are addressed by using OEM platform checks. Signed-off-by: Suresh Siddha Signed-off-by: Yinghai Lu Cc: Ananth N Mavinakayanahalli Cc: Chris McDermott Signed-off-by: Linus Torvalds --- arch/x86/kernel/apic/apic.c | 15 +-------------- arch/x86/kernel/apic/probe_64.c | 8 +++----- 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e80f291472a4..3987e4408f75 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. - * - * This determines the messaging protocol we can use: if all APIC IDs - * are in the 0 ... 7 range, then we can use logical addressing which - * has some performance advantages (better broadcasting). - * - * If there's an APIC ID above 8, we use physical addressing. */ unsigned int max_physical_apicid; @@ -1898,14 +1892,7 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { + if (num_processors > 8) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: if (!APIC_XAPIC(version)) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc82..450fe2064a14 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,15 +64,13 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif - if (apic == &apic_flat) { - if (max_physical_apicid >= 8) + if (apic == &apic_flat && num_processors > 8) apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); - } + + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ -- cgit v1.2.1 From b27d515a49169e5e2a92d621faac761074a8c5b1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 18 Jan 2010 10:58:01 +0200 Subject: perf: x86: Add support for the ANY bit Propagate the ANY bit into the fixed counter config for v3 and higher. Signed-off-by: Stephane Eranian [a.p.zijlstra@chello.nl: split from larger patch] Signed-off-by: Peter Zijlstra LKML-Reference: <4b5430c6.0f975e0a.1bf9.ffff85fe@mx.google.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d616c06e99b4..8c1c07073ccc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1343,6 +1343,13 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + bits <<= (idx * 4); mask = 0xfULL << (idx * 4); -- cgit v1.2.1 From d91afd15b041f27d34859c79afa9e172018a86f4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 16:40:20 +0100 Subject: x86/amd-iommu: Fix possible integer overflow The variable i in this function could be increased to over 2**32 which would result in an integer overflow when using int. Fix it by changing i to unsigned long. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 23824fef789c..c2ccbd7b862f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -980,7 +980,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; struct amd_iommu *iommu; - int i; + unsigned long i; #ifdef CONFIG_IOMMU_STRESS populate = false; -- cgit v1.2.1 From 2ca762790caf822f7b61430fbaffa3ae4219977f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 16:45:31 +0100 Subject: x86/amd-iommu: Fix NULL pointer dereference in __detach_device() In the __detach_device function the reference count for a device-domain binding may become zero. This results in the device being removed from the domain and dev_data->domain will be NULL. This is bad because this pointer is dereferenced when trying to unlock the domain->lock. This patch fixes the issue by keeping the domain in a seperate variable. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c2ccbd7b862f..4478a48198a8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1489,11 +1489,14 @@ static void __detach_device(struct device *dev) { struct iommu_dev_data *dev_data = get_dev_data(dev); struct iommu_dev_data *alias_data; + struct protection_domain *domain; unsigned long flags; BUG_ON(!dev_data->domain); - spin_lock_irqsave(&dev_data->domain->lock, flags); + domain = dev_data->domain; + + spin_lock_irqsave(&domain->lock, flags); if (dev_data->alias != dev) { alias_data = get_dev_data(dev_data->alias); @@ -1504,7 +1507,7 @@ static void __detach_device(struct device *dev) if (atomic_dec_and_test(&dev_data->bind)) do_detach(dev); - spin_unlock_irqrestore(&dev_data->domain->lock, flags); + spin_unlock_irqrestore(&domain->lock, flags); /* * If we run in passthrough mode the device must be assigned to the -- cgit v1.2.1 From f5325094379158e6b876ea0010c807bf7890ec8f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 17:44:35 +0100 Subject: x86/amd-iommu: Fix IOMMU-API initialization for iommu=pt This patch moves the initialization of the iommu-api out of the dma-ops initialization code. This ensures that the iommu-api is initialized even with iommu=pt. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 8 ++++++-- arch/x86/kernel/amd_iommu_init.c | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4478a48198a8..751ce73c6e1b 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2221,6 +2221,12 @@ static struct dma_map_ops amd_iommu_dma_ops = { /* * The function which clues the AMD IOMMU driver into dma_ops. */ + +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; @@ -2256,8 +2262,6 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; - register_iommu(&amd_iommu_ops); - amd_iommu_stats_init(); return 0; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fb490ce7dd55..9dc91b431470 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1292,9 +1292,12 @@ static int __init amd_iommu_init(void) ret = amd_iommu_init_passthrough(); else ret = amd_iommu_init_dma_ops(); + if (ret) goto free; + amd_iommu_init_api(); + amd_iommu_init_notifier(); enable_iommus(); -- cgit v1.2.1 From d3ad9373b7c29b63d5e8460a69453718d200cc3b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 22 Jan 2010 17:55:27 +0100 Subject: x86/amd-iommu: Fix deassignment of a device from the pt_domain Deassigning a device from the passthrough domain does not work and breaks device assignment to kvm guests. This patch fixes the issue. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 751ce73c6e1b..adb0ba025702 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1511,9 +1511,11 @@ static void __detach_device(struct device *dev) /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && dev_data->domain == NULL) + if (iommu_pass_through && + (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev, pt_domain); } -- cgit v1.2.1 From 73472a46b5b28116b145fb5fc05242c1aa8e1461 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Thu, 21 Jan 2010 11:09:52 -0800 Subject: x86: Disable HPET MSI on ATI SB700/SB800 HPET MSI on platforms with ATI SB700/SB800 as they seem to have some side-effects on floppy DMA. Do not use HPET MSI on such platforms. Original problem report from Mark Hounschell http://lkml.indiana.edu/hypermail/linux/kernel/0912.2/01118.html [ This patch needs to go to stable as well. But, there are some conflicts that prevents the patch from going as is. I can rebase/resubmit to stable once the patch goes upstream. hpa: still Cc:'ing stable@ as an FYI. ] Tested-by: Mark Hounschell Signed-off-by: Venkatesh Pallipadi Cc: LKML-Reference: <20100121190952.GA32523@linux-os.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/hpet.c | 8 ++++++++ arch/x86/kernel/quirks.c | 13 +++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba6e65884603..ad80a1c718c6 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -34,6 +34,8 @@ */ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -596,6 +598,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer) unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + if (boot_cpu_has(X86_FEATURE_ARAT)) return; id = hpet_readl(HPET_ID); @@ -928,6 +933,9 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 18093d7498f0..12e9feaa2f7a 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -491,6 +491,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) -- cgit v1.2.1 From 3b2e3d85aeb80769fb96c15ee4f6e14135328471 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 22 Jan 2010 21:34:56 +0100 Subject: Revert "x86: ucode-amd: Load ucode-patches once ..." Commit d1c84f79a6ba992dc01e312c44a21496303874d6 leads to a regression when microcode_amd.c is compiled into the kernel. It causes a big boot delay because the firmware is not available. See http://marc.info/?l=linux-kernel&m=126267290920060 It also renders the reload sysfs attribute useless. Fixing this is too intrusive for an -rc5 kernel. Thus I'd like to restore the microcode loading behaviour of kernel 2.6.32. CC: Gene Heskett Signed-off-by: Andreas Herrmann LKML-Reference: <20100122203456.GB13792@alberich.amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/microcode_amd.c | 44 ++++++++++++---------------------------- arch/x86/kernel/microcode_core.c | 6 ------ 2 files changed, 13 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 37542b67c57e..e1af7c055c7d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 -const struct firmware *firmware; -static int supported_cpu; - struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { + struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - if (!supported_cpu) - return -1; - memset(csig, 0, sizeof(*csig)); + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " + "supported\n", cpu, c->x86); + return -1; + } rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); return 0; @@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) static enum ucode_state request_microcode_fw(int cpu, struct device *device) { + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; enum ucode_state ret; - if (firmware == NULL) + if (request_firmware(&firmware, fw_name, device)) { + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); return UCODE_NFOUND; + } if (*(u32 *)firmware->data != UCODE_MAGIC) { pr_err("invalid UCODE_MAGIC (0x%08x)\n", @@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) ret = generic_load_microcode(cpu, firmware->data, firmware->size); + release_firmware(firmware); + return ret; } @@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } -void init_microcode_amd(struct device *device) -{ - const char *fw_name = "amd-ucode/microcode_amd.bin"; - struct cpuinfo_x86 *c = &boot_cpu_data; - - WARN_ON(c->x86_vendor != X86_VENDOR_AMD); - - if (c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); - return; - } - supported_cpu = 1; - - if (request_firmware(&firmware, fw_name, device)) - pr_err("failed to load file %s\n", fw_name); -} - -void fini_microcode_amd(void) -{ - release_firmware(firmware); -} - static struct microcode_ops microcode_amd_ops = { - .init = init_microcode_amd, - .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0c8632433090..cceb5bc3c3c2 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -521,9 +521,6 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } - if (microcode_ops->init) - microcode_ops->init(µcode_pdev->dev); - get_online_cpus(); mutex_lock(µcode_mutex); @@ -566,9 +563,6 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); - if (microcode_ops->fini) - microcode_ops->fini(); - microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -- cgit v1.2.1 From b160091802d4a76dd063facb09fcf10bf5d5d747 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 23 Jan 2010 18:27:47 -0800 Subject: x86: Remove "x86 CPU features in debugfs" (CONFIG_X86_CPU_DEBUG) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_X86_CPU_DEBUG, which provides some parsed versions of the x86 CPU configuration via debugfs, has caused boot failures on real hardware. The value of this feature has been marginal at best, as all this information is already available to userspace via generic interfaces. Causes crashes that have not been fixed + minimal utility -> remove. See the referenced LKML thread for more information. Reported-by: Ozan Çağlayan Signed-off-by: H. Peter Anvin LKML-Reference: Cc: Jaswinder Singh Rajput Cc: Linus Torvalds Cc: Rafael J. Wysocki Cc: Yinghai Lu Cc: --- arch/x86/kernel/cpu/Makefile | 2 - arch/x86/kernel/cpu/cpu_debug.c | 688 ---------------------------------------- 2 files changed, 690 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cpu_debug.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1d2cb383410e..c202b62f3671 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c deleted file mode 100644 index b368cd862997..000000000000 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ /dev/null @@ -1,688 +0,0 @@ -/* - * CPU x86 architecture debug code - * - * Copyright(C) 2009 Jaswinder Singh Rajput - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr); -static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr); -static DEFINE_PER_CPU(int, cpud_priv_count); - -static DEFINE_MUTEX(cpu_debug_lock); - -static struct dentry *cpu_debugfs_dir; - -static struct cpu_debug_base cpu_base[] = { - { "mc", CPU_MC, 0 }, - { "monitor", CPU_MONITOR, 0 }, - { "time", CPU_TIME, 0 }, - { "pmc", CPU_PMC, 1 }, - { "platform", CPU_PLATFORM, 0 }, - { "apic", CPU_APIC, 0 }, - { "poweron", CPU_POWERON, 0 }, - { "control", CPU_CONTROL, 0 }, - { "features", CPU_FEATURES, 0 }, - { "lastbranch", CPU_LBRANCH, 0 }, - { "bios", CPU_BIOS, 0 }, - { "freq", CPU_FREQ, 0 }, - { "mtrr", CPU_MTRR, 0 }, - { "perf", CPU_PERF, 0 }, - { "cache", CPU_CACHE, 0 }, - { "sysenter", CPU_SYSENTER, 0 }, - { "therm", CPU_THERM, 0 }, - { "misc", CPU_MISC, 0 }, - { "debug", CPU_DEBUG, 0 }, - { "pat", CPU_PAT, 0 }, - { "vmx", CPU_VMX, 0 }, - { "call", CPU_CALL, 0 }, - { "base", CPU_BASE, 0 }, - { "ver", CPU_VER, 0 }, - { "conf", CPU_CONF, 0 }, - { "smm", CPU_SMM, 0 }, - { "svm", CPU_SVM, 0 }, - { "osvm", CPU_OSVM, 0 }, - { "tss", CPU_TSS, 0 }, - { "cr", CPU_CR, 0 }, - { "dt", CPU_DT, 0 }, - { "registers", CPU_REG_ALL, 0 }, -}; - -static struct cpu_file_base cpu_file[] = { - { "index", CPU_REG_ALL, 0 }, - { "value", CPU_REG_ALL, 1 }, -}; - -/* CPU Registers Range */ -static struct cpu_debug_range cpu_reg_range[] = { - { 0x00000000, 0x00000001, CPU_MC, }, - { 0x00000006, 0x00000007, CPU_MONITOR, }, - { 0x00000010, 0x00000010, CPU_TIME, }, - { 0x00000011, 0x00000013, CPU_PMC, }, - { 0x00000017, 0x00000017, CPU_PLATFORM, }, - { 0x0000001B, 0x0000001B, CPU_APIC, }, - { 0x0000002A, 0x0000002B, CPU_POWERON, }, - { 0x0000002C, 0x0000002C, CPU_FREQ, }, - { 0x0000003A, 0x0000003A, CPU_CONTROL, }, - { 0x00000040, 0x00000047, CPU_LBRANCH, }, - { 0x00000060, 0x00000067, CPU_LBRANCH, }, - { 0x00000079, 0x00000079, CPU_BIOS, }, - { 0x00000088, 0x0000008A, CPU_CACHE, }, - { 0x0000008B, 0x0000008B, CPU_BIOS, }, - { 0x0000009B, 0x0000009B, CPU_MONITOR, }, - { 0x000000C1, 0x000000C4, CPU_PMC, }, - { 0x000000CD, 0x000000CD, CPU_FREQ, }, - { 0x000000E7, 0x000000E8, CPU_PERF, }, - { 0x000000FE, 0x000000FE, CPU_MTRR, }, - - { 0x00000116, 0x0000011E, CPU_CACHE, }, - { 0x00000174, 0x00000176, CPU_SYSENTER, }, - { 0x00000179, 0x0000017B, CPU_MC, }, - { 0x00000186, 0x00000189, CPU_PMC, }, - { 0x00000198, 0x00000199, CPU_PERF, }, - { 0x0000019A, 0x0000019A, CPU_TIME, }, - { 0x0000019B, 0x0000019D, CPU_THERM, }, - { 0x000001A0, 0x000001A0, CPU_MISC, }, - { 0x000001C9, 0x000001C9, CPU_LBRANCH, }, - { 0x000001D7, 0x000001D8, CPU_LBRANCH, }, - { 0x000001D9, 0x000001D9, CPU_DEBUG, }, - { 0x000001DA, 0x000001E0, CPU_LBRANCH, }, - - { 0x00000200, 0x0000020F, CPU_MTRR, }, - { 0x00000250, 0x00000250, CPU_MTRR, }, - { 0x00000258, 0x00000259, CPU_MTRR, }, - { 0x00000268, 0x0000026F, CPU_MTRR, }, - { 0x00000277, 0x00000277, CPU_PAT, }, - { 0x000002FF, 0x000002FF, CPU_MTRR, }, - - { 0x00000300, 0x00000311, CPU_PMC, }, - { 0x00000345, 0x00000345, CPU_PMC, }, - { 0x00000360, 0x00000371, CPU_PMC, }, - { 0x0000038D, 0x00000390, CPU_PMC, }, - { 0x000003A0, 0x000003BE, CPU_PMC, }, - { 0x000003C0, 0x000003CD, CPU_PMC, }, - { 0x000003E0, 0x000003E1, CPU_PMC, }, - { 0x000003F0, 0x000003F2, CPU_PMC, }, - - { 0x00000400, 0x00000417, CPU_MC, }, - { 0x00000480, 0x0000048B, CPU_VMX, }, - - { 0x00000600, 0x00000600, CPU_DEBUG, }, - { 0x00000680, 0x0000068F, CPU_LBRANCH, }, - { 0x000006C0, 0x000006CF, CPU_LBRANCH, }, - - { 0x000107CC, 0x000107D3, CPU_PMC, }, - - { 0xC0000080, 0xC0000080, CPU_FEATURES, }, - { 0xC0000081, 0xC0000084, CPU_CALL, }, - { 0xC0000100, 0xC0000102, CPU_BASE, }, - { 0xC0000103, 0xC0000103, CPU_TIME, }, - - { 0xC0010000, 0xC0010007, CPU_PMC, }, - { 0xC0010010, 0xC0010010, CPU_CONF, }, - { 0xC0010015, 0xC0010015, CPU_CONF, }, - { 0xC0010016, 0xC001001A, CPU_MTRR, }, - { 0xC001001D, 0xC001001D, CPU_MTRR, }, - { 0xC001001F, 0xC001001F, CPU_CONF, }, - { 0xC0010030, 0xC0010035, CPU_BIOS, }, - { 0xC0010044, 0xC0010048, CPU_MC, }, - { 0xC0010050, 0xC0010056, CPU_SMM, }, - { 0xC0010058, 0xC0010058, CPU_CONF, }, - { 0xC0010060, 0xC0010060, CPU_CACHE, }, - { 0xC0010061, 0xC0010068, CPU_SMM, }, - { 0xC0010069, 0xC001006B, CPU_SMM, }, - { 0xC0010070, 0xC0010071, CPU_SMM, }, - { 0xC0010111, 0xC0010113, CPU_SMM, }, - { 0xC0010114, 0xC0010118, CPU_SVM, }, - { 0xC0010140, 0xC0010141, CPU_OSVM, }, - { 0xC0011022, 0xC0011023, CPU_CONF, }, -}; - -static int is_typeflag_valid(unsigned cpu, unsigned flag) -{ - int i; - - /* Standard Registers should be always valid */ - if (flag >= CPU_TSS) - return 1; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (cpu_reg_range[i].flag == flag) - return 1; - } - - /* Invalid */ - return 0; -} - -static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max, - int index, unsigned flag) -{ - if (cpu_reg_range[index].flag == flag) { - *min = cpu_reg_range[index].min; - *max = cpu_reg_range[index].max; - } else - *max = 0; - - return *max; -} - -/* This function can also be called with seq = NULL for printk */ -static void print_cpu_data(struct seq_file *seq, unsigned type, - u32 low, u32 high) -{ - struct cpu_private *priv; - u64 val = high; - - if (seq) { - priv = seq->private; - if (priv->file) { - val = (val << 32) | low; - seq_printf(seq, "0x%llx\n", val); - } else - seq_printf(seq, " %08x: %08x_%08x\n", - type, high, low); - } else - printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low); -} - -/* This function can also be called with seq = NULL for printk */ -static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag) -{ - unsigned msr, msr_min, msr_max; - struct cpu_private *priv; - u32 low, high; - int i; - - if (seq) { - priv = seq->private; - if (priv->file) { - if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg, - &low, &high)) - print_cpu_data(seq, priv->reg, low, high); - return; - } - } - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag)) - continue; - - for (msr = msr_min; msr <= msr_max; msr++) { - if (rdmsr_safe_on_cpu(cpu, msr, &low, &high)) - continue; - print_cpu_data(seq, msr, low, high); - } - } -} - -static void print_tss(void *arg) -{ - struct pt_regs *regs = task_pt_regs(current); - struct seq_file *seq = arg; - unsigned int seg; - - seq_printf(seq, " RAX\t: %016lx\n", regs->ax); - seq_printf(seq, " RBX\t: %016lx\n", regs->bx); - seq_printf(seq, " RCX\t: %016lx\n", regs->cx); - seq_printf(seq, " RDX\t: %016lx\n", regs->dx); - - seq_printf(seq, " RSI\t: %016lx\n", regs->si); - seq_printf(seq, " RDI\t: %016lx\n", regs->di); - seq_printf(seq, " RBP\t: %016lx\n", regs->bp); - seq_printf(seq, " ESP\t: %016lx\n", regs->sp); - -#ifdef CONFIG_X86_64 - seq_printf(seq, " R08\t: %016lx\n", regs->r8); - seq_printf(seq, " R09\t: %016lx\n", regs->r9); - seq_printf(seq, " R10\t: %016lx\n", regs->r10); - seq_printf(seq, " R11\t: %016lx\n", regs->r11); - seq_printf(seq, " R12\t: %016lx\n", regs->r12); - seq_printf(seq, " R13\t: %016lx\n", regs->r13); - seq_printf(seq, " R14\t: %016lx\n", regs->r14); - seq_printf(seq, " R15\t: %016lx\n", regs->r15); -#endif - - asm("movl %%cs,%0" : "=r" (seg)); - seq_printf(seq, " CS\t: %04x\n", seg); - asm("movl %%ds,%0" : "=r" (seg)); - seq_printf(seq, " DS\t: %04x\n", seg); - seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff); - asm("movl %%es,%0" : "=r" (seg)); - seq_printf(seq, " ES\t: %04x\n", seg); - asm("movl %%fs,%0" : "=r" (seg)); - seq_printf(seq, " FS\t: %04x\n", seg); - asm("movl %%gs,%0" : "=r" (seg)); - seq_printf(seq, " GS\t: %04x\n", seg); - - seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags); - - seq_printf(seq, " EIP\t: %016lx\n", regs->ip); -} - -static void print_cr(void *arg) -{ - struct seq_file *seq = arg; - - seq_printf(seq, " cr0\t: %016lx\n", read_cr0()); - seq_printf(seq, " cr2\t: %016lx\n", read_cr2()); - seq_printf(seq, " cr3\t: %016lx\n", read_cr3()); - seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe()); -#ifdef CONFIG_X86_64 - seq_printf(seq, " cr8\t: %016lx\n", read_cr8()); -#endif -} - -static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt) -{ - seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size)); -} - -static void print_dt(void *seq) -{ - struct desc_ptr dt; - unsigned long ldt; - - /* IDT */ - store_idt((struct desc_ptr *)&dt); - print_desc_ptr("IDT", seq, dt); - - /* GDT */ - store_gdt((struct desc_ptr *)&dt); - print_desc_ptr("GDT", seq, dt); - - /* LDT */ - store_ldt(ldt); - seq_printf(seq, " LDT\t: %016lx\n", ldt); - - /* TR */ - store_tr(ldt); - seq_printf(seq, " TR\t: %016lx\n", ldt); -} - -static void print_dr(void *arg) -{ - struct seq_file *seq = arg; - unsigned long dr; - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - get_debugreg(dr, i); - seq_printf(seq, " dr%d\t: %016lx\n", i, dr); - } - - seq_printf(seq, "\n MSR\t:\n"); -} - -static void print_apic(void *arg) -{ - struct seq_file *seq = arg; - -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(seq, " LAPIC\t:\n"); - seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24); - seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR)); - seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI)); - seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI)); - seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI)); - seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR)); - seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR)); - seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV)); - seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR)); - seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR)); - seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR)); - seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2)); - seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT)); - seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR)); - seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC)); - seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0)); - seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1)); - seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR)); - seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT)); - seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT)); - seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR)); - if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { - unsigned int i, v, maxeilvt; - - v = apic_read(APIC_EFEAT); - maxeilvt = (v >> 16) & 0xff; - seq_printf(seq, " EFEAT\t\t: %08x\n", v); - seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL)); - - for (i = 0; i < maxeilvt; i++) { - v = apic_read(APIC_EILVTn(i)); - seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v); - } - } -#endif /* CONFIG_X86_LOCAL_APIC */ - seq_printf(seq, "\n MSR\t:\n"); -} - -static int cpu_seq_show(struct seq_file *seq, void *v) -{ - struct cpu_private *priv = seq->private; - - if (priv == NULL) - return -EINVAL; - - switch (cpu_base[priv->type].flag) { - case CPU_TSS: - smp_call_function_single(priv->cpu, print_tss, seq, 1); - break; - case CPU_CR: - smp_call_function_single(priv->cpu, print_cr, seq, 1); - break; - case CPU_DT: - smp_call_function_single(priv->cpu, print_dt, seq, 1); - break; - case CPU_DEBUG: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_dr, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - case CPU_APIC: - if (priv->file == CPU_INDEX_BIT) - smp_call_function_single(priv->cpu, print_apic, seq, 1); - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - - default: - print_msr(seq, priv->cpu, cpu_base[priv->type].flag); - break; - } - seq_printf(seq, "\n"); - - return 0; -} - -static void *cpu_seq_start(struct seq_file *seq, loff_t *pos) -{ - if (*pos == 0) /* One time is enough ;-) */ - return seq; - - return NULL; -} - -static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - (*pos)++; - - return cpu_seq_start(seq, pos); -} - -static void cpu_seq_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations cpu_seq_ops = { - .start = cpu_seq_start, - .next = cpu_seq_next, - .stop = cpu_seq_stop, - .show = cpu_seq_show, -}; - -static int cpu_seq_open(struct inode *inode, struct file *file) -{ - struct cpu_private *priv = inode->i_private; - struct seq_file *seq; - int err; - - err = seq_open(file, &cpu_seq_ops); - if (!err) { - seq = file->private_data; - seq->private = priv; - } - - return err; -} - -static int write_msr(struct cpu_private *priv, u64 val) -{ - u32 low, high; - - high = (val >> 32) & 0xffffffff; - low = val & 0xffffffff; - - if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high)) - return 0; - - return -EPERM; -} - -static int write_cpu_register(struct cpu_private *priv, const char *buf) -{ - int ret = -EPERM; - u64 val; - - ret = strict_strtoull(buf, 0, &val); - if (ret < 0) - return ret; - - /* Supporting only MSRs */ - if (priv->type < CPU_TSS_BIT) - return write_msr(priv, val); - - return ret; -} - -static ssize_t cpu_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct cpu_private *priv = seq->private; - char buf[19]; - - if ((priv == NULL) || (count >= sizeof(buf))) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, count)) - return -EFAULT; - - buf[count] = 0; - - if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write)) - if (!write_cpu_register(priv, buf)) - return count; - - return -EACCES; -} - -static const struct file_operations cpu_fops = { - .owner = THIS_MODULE, - .open = cpu_seq_open, - .read = seq_read, - .write = cpu_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg, - unsigned file, struct dentry *dentry) -{ - struct cpu_private *priv = NULL; - - /* Already intialized */ - if (file == CPU_INDEX_BIT) - if (per_cpu(cpud_arr[type].init, cpu)) - return 0; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv == NULL) - return -ENOMEM; - - priv->cpu = cpu; - priv->type = type; - priv->reg = reg; - priv->file = file; - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_priv_arr[type], cpu) = priv; - per_cpu(cpud_priv_count, cpu)++; - mutex_unlock(&cpu_debug_lock); - - if (file) - debugfs_create_file(cpu_file[file].name, S_IRUGO, - dentry, (void *)priv, &cpu_fops); - else { - debugfs_create_file(cpu_base[type].name, S_IRUGO, - per_cpu(cpud_arr[type].dentry, cpu), - (void *)priv, &cpu_fops); - mutex_lock(&cpu_debug_lock); - per_cpu(cpud_arr[type].init, cpu) = 1; - mutex_unlock(&cpu_debug_lock); - } - - return 0; -} - -static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg, - struct dentry *dentry) -{ - unsigned file; - int err = 0; - - for (file = 0; file < ARRAY_SIZE(cpu_file); file++) { - err = cpu_create_file(cpu, type, reg, file, dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned reg, reg_min, reg_max; - int i, err = 0; - char reg_dir[12]; - u32 low, high; - - for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) { - if (!get_cpu_range(cpu, ®_min, ®_max, i, - cpu_base[type].flag)) - continue; - - for (reg = reg_min; reg <= reg_max; reg++) { - if (rdmsr_safe_on_cpu(cpu, reg, &low, &high)) - continue; - - sprintf(reg_dir, "0x%x", reg); - cpu_dentry = debugfs_create_dir(reg_dir, dentry); - err = cpu_init_regfiles(cpu, type, reg, cpu_dentry); - if (err) - return err; - } - } - - return err; -} - -static int cpu_init_allreg(unsigned cpu, struct dentry *dentry) -{ - struct dentry *cpu_dentry = NULL; - unsigned type; - int err = 0; - - for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) { - if (!is_typeflag_valid(cpu, cpu_base[type].flag)) - continue; - cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry); - per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry; - - if (type < CPU_TSS_BIT) - err = cpu_init_msr(cpu, type, cpu_dentry); - else - err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT, - cpu_dentry); - if (err) - return err; - } - - return err; -} - -static int cpu_init_cpu(void) -{ - struct dentry *cpu_dentry = NULL; - struct cpuinfo_x86 *cpui; - char cpu_dir[12]; - unsigned cpu; - int err = 0; - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - cpui = &cpu_data(cpu); - if (!cpu_has(cpui, X86_FEATURE_MSR)) - continue; - - sprintf(cpu_dir, "cpu%d", cpu); - cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir); - err = cpu_init_allreg(cpu, cpu_dentry); - - pr_info("cpu%d(%d) debug files %d\n", - cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu)); - if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) { - pr_err("Register files count %d exceeds limit %d\n", - per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES); - per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES; - err = -ENFILE; - } - if (err) - return err; - } - - return err; -} - -static int __init cpu_debug_init(void) -{ - cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir); - - return cpu_init_cpu(); -} - -static void __exit cpu_debug_exit(void) -{ - int i, cpu; - - if (cpu_debugfs_dir) - debugfs_remove_recursive(cpu_debugfs_dir); - - for (cpu = 0; cpu < nr_cpu_ids; cpu++) - for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++) - kfree(per_cpu(cpud_priv_arr[i], cpu)); -} - -module_init(cpu_debug_init); -module_exit(cpu_debug_exit); - -MODULE_AUTHOR("Jaswinder Singh Rajput"); -MODULE_DESCRIPTION("CPU Debug module"); -MODULE_LICENSE("GPL"); -- cgit v1.2.1 From da482474b8396e1a099c37ffc6541b78775aedb4 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Tue, 26 Jan 2010 20:37:22 -0600 Subject: x86, msr/cpuid: Pass the number of minors when unregistering MSR and CPUID drivers. Pass the number of minors when unregistering MSR and CPUID drivers. Reported-by: Dean Nelson Signed-off-by: Dean Nelson LKML-Reference: <20100127023722.GA22305@sgi.com> Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/msr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index cb27fd6136c9..83e5e628de73 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -229,7 +229,7 @@ static void __exit cpuid_exit(void) for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4bd93c9b2b27..206735ac8cbd 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -285,7 +285,7 @@ static void __exit msr_exit(void) for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } -- cgit v1.2.1 From aca3bb5910119d4cf6c28568a642582efb4cc14a Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 22 Jan 2010 09:41:40 -0600 Subject: x86, UV: Fix RTC latency bug by reading replicated cachelines For SGI UV node controllers (HUB) rev 2.0 or greater, use replicated cachelines to read the RTC timer. This optimization allows faster simulataneous reads from a given socket. Signed-off-by: Dimitri Sivanich Cc: Jack Steiner LKML-Reference: <20100122154140.GB4975@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_time.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 3c84aa001c11..2b75ef638dbc 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -282,10 +282,21 @@ static int uv_rtc_unset_timer(int cpu, int force) /* * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. */ static cycle_t uv_read_rtc(struct clocksource *cs) { - return (cycle_t)uv_read_local_mmr(UVH_RTC); + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); } /* -- cgit v1.2.1 From 35ea63d70f827a26c150993b4b940925bb02b03f Mon Sep 17 00:00:00 2001 From: Leann Ogasawara Date: Wed, 27 Jan 2010 15:29:18 -0800 Subject: x86: Add Dell OptiPlex 760 reboot quirk Dell OptiPlex 760 hangs on reboot unless reboot=bios is used. Add quirk to reboot through the BIOS. BugLink: https://bugs.launchpad.net/bugs/488319 Signed-off-by: Leann Ogasawara LKML-Reference: <1264634958.27335.1091.camel@emiko> Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 1545bc0c9845..704bddcdf64d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", -- cgit v1.2.1 From 05d43ed8a89c159ff641d472f970e3f1baa66318 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 28 Jan 2010 22:14:43 -0800 Subject: x86: get rid of the insane TIF_ABI_PENDING bit Now that the previous commit made it possible to do the personality setting at the point of no return, we do just that for ELF binaries. And suddenly all the reasons for that insane TIF_ABI_PENDING bit go away, and we can just make SET_PERSONALITY() just do the obvious thing for a 32-bit compat process. Everything becomes much more straightforward this way. Signed-off-by: H. Peter Anvin Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/process.c | 12 ------------ arch/x86/kernel/process_64.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 02c3ee013ccd..c9b3522b6b46 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -115,18 +115,6 @@ void flush_thread(void) { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f9e033150cdf..41a26a82470a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -521,6 +521,17 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + unsigned long get_wchan(struct task_struct *p) { unsigned long stack; -- cgit v1.2.1 From 7c099ce1575126395f186ecf58b51a60d5c3be7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20H=C3=A4rdeman?= Date: Thu, 28 Jan 2010 21:02:54 +0100 Subject: x86: Add quirk for Intel DG45FC board to avoid low memory corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6aa542a694dc9ea4344a8a590d2628c33d1b9431 added a quirk for the Intel DG45ID board due to low memory corruption. The Intel DG45FC shares the same BIOS (and the same bug) as noted in: http://bugzilla.kernel.org/show_bug.cgi?id=13736 Signed-off-by: David Härdeman LKML-Reference: <20100128200254.GA9134@hardeman.nu> Cc: Cc: Alexey Fisher Cc: ykzhao Cc: Tony Bones Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f7b8b9894b22..5d9e40c58628 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -642,19 +642,27 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. */ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; -- cgit v1.2.1 From cc0967490c1c3824bc5b75718b6ca8a51d9f2617 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:42 -0600 Subject: x86, hw_breakpoints, kgdb: Fix kgdb to use hw_breakpoint API In the 2.6.33 kernel, the hw_breakpoint API is now used for the performance event counters. The hw_breakpoint_handler() now consumes the hw breakpoints that were previously set by kgdb arch specific code. In order for kgdb to work in conjunction with this core API change, kgdb must use some of the low level functions of the hw_breakpoint API to install, uninstall, and deal with hw breakpoint reservations. The kgdb core required a change to call kgdb_disable_hw_debug anytime a slave cpu enters kgdb_wait() in order to keep all the hw breakpoints in sync as well as to prevent hitting a hw breakpoint while kgdb is active. During the architecture specific initialization of kgdb, it will pre-allocate 4 disabled (struct perf event **) structures. Kgdb will use these to manage the capabilities for the 4 hw breakpoint registers, per cpu. Right now the hw_breakpoint API does not have a way to ask how many breakpoints are available, on each CPU so it is possible that the install of a breakpoint might fail when kgdb restores the system to the run state. The intent of this patch is to first get the basic functionality of hw breakpoints working and leave it to the person debugging the kernel to understand what hw breakpoints are in use and what restrictions have been imposed as a result. Breakpoint constraints will be dealt with in a future patch. While atomic, the x86 specific kgdb code will call arch_uninstall_hw_breakpoint() and arch_install_hw_breakpoint() to manage the cpu specific hw breakpoints. The net result of these changes allow kgdb to use the same pool of hw_breakpoints that are used by the perf event API, but neither knows about future reservations for the available hw breakpoint slots. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 171 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 57 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index dd74fe7273b1..62bea7307eaa 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -204,40 +205,38 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) static struct hw_breakpoint { unsigned enabled; - unsigned type; - unsigned len; unsigned long addr; + int len; + int type; + struct perf_event **pev; } breakinfo[4]; static void kgdb_correct_hw_break(void) { - unsigned long dr7; - int correctit = 0; - int breakbit; int breakno; - get_debugreg(dr7, 7); for (breakno = 0; breakno < 4; breakno++) { - breakbit = 2 << (breakno << 1); - if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { - correctit = 1; - dr7 |= breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - dr7 |= ((breakinfo[breakno].len << 2) | - breakinfo[breakno].type) << - ((breakno << 2) + 16); - set_debugreg(breakinfo[breakno].addr, breakno); - - } else { - if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { - correctit = 1; - dr7 &= ~breakbit; - dr7 &= ~(0xf0000 << (breakno << 2)); - } - } + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; } - if (correctit) - set_debugreg(dr7, 7); + hw_breakpoint_restore(); } static int @@ -259,15 +258,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) static void kgdb_remove_all_hw_break(void) { int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; - for (i = 0; i < 4; i++) - memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } static int kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { - unsigned type; int i; for (i = 0; i < 4; i++) @@ -278,27 +285,38 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) switch (bptype) { case BP_HARDWARE_BREAKPOINT: - type = 0; - len = 1; + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; break; case BP_WRITE_WATCHPOINT: - type = 1; + breakinfo[i].type = X86_BREAKPOINT_WRITE; break; case BP_ACCESS_WATCHPOINT: - type = 3; + breakinfo[i].type = X86_BREAKPOINT_RW; break; default: return -1; } - - if (len == 1 || len == 2 || len == 4) - breakinfo[i].len = len - 1; - else + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: return -1; - - breakinfo[i].enabled = 1; + } breakinfo[i].addr = addr; - breakinfo[i].type = type; + breakinfo[i].enabled = 1; return 0; } @@ -313,8 +331,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) */ void kgdb_disable_hw_debug(struct pt_regs *regs) { + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); + for (i = 0; i < 4; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } } /** @@ -378,7 +409,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, struct pt_regs *linux_regs) { unsigned long addr; - unsigned long dr6; char *ptr; int newPC; @@ -404,20 +434,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - get_debugreg(dr6, 6); - if (!(dr6 & 0x4000)) { - int breakno; - - for (breakno = 0; breakno < 4; breakno++) { - if (dr6 & (1 << breakno) && - breakinfo[breakno].type == 0) { - /* Set restore flag: */ - linux_regs->flags |= X86_EFLAGS_RF; - break; - } - } - } - set_debugreg(0UL, 6); kgdb_correct_hw_break(); return 0; @@ -485,8 +501,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) break; case DIE_DEBUG: - if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id()) { + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) return single_step_cont(regs, args); break; @@ -539,7 +554,42 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { - return register_die_notifier(&kgdb_notifier); + int i, cpu; + int ret; + struct perf_event_attr attr; + struct perf_event **pevent; + + ret = register_die_notifier(&kgdb_notifier); + if (ret != 0) + return ret; + /* + * Pre-allocate the hw breakpoint structions in the non-atomic + * portion of kgdb because this operation requires mutexs to + * complete. + */ + attr.bp_addr = (unsigned long)kgdb_arch_init; + attr.type = PERF_TYPE_BREAKPOINT; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_W; + attr.disabled = 1; + for (i = 0; i < 4; i++) { + breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); + if (IS_ERR(breakinfo[i].pev)) { + printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + breakinfo[i].pev = NULL; + kgdb_arch_exit(); + return -1; + } + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[i].pev, cpu); + pevent[0]->hw.sample_period = 1; + if (pevent[0]->destroy != NULL) { + pevent[0]->destroy = NULL; + release_bp_slot(*pevent); + } + } + } + return ret; } /** @@ -550,6 +600,13 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { + int i; + for (i = 0; i < 4; i++) { + if (breakinfo[i].pev) { + unregister_wide_hw_breakpoint(breakinfo[i].pev); + breakinfo[i].pev = NULL; + } + } unregister_die_notifier(&kgdb_notifier); } -- cgit v1.2.1 From 5352ae638e2d7d5c9b2e4d528676bbf2af6fd6f3 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 28 Jan 2010 17:04:43 -0600 Subject: perf, hw_breakpoint, kgdb: Do not take mutex for kernel debugger This patch fixes the regression in functionality where the kernel debugger and the perf API do not nicely share hw breakpoint reservations. The kernel debugger cannot use any mutex_lock() calls because it can start the kernel running from an invalid context. A mutex free version of the reservation API needed to get created for the kernel debugger to safely update hw breakpoint reservations. The possibility for a breakpoint reservation to be concurrently processed at the time that kgdb interrupts the system is improbable. Should this corner case occur the end user is warned, and the kernel debugger will prohibit updating the hardware breakpoint reservations. Any time the kernel debugger reserves a hardware breakpoint it will be a system wide reservation. Signed-off-by: Jason Wessel Acked-by: Frederic Weisbecker Cc: kgdb-bugreport@lists.sourceforge.net Cc: K.Prasad Cc: Peter Zijlstra Cc: Alan Stern Cc: torvalds@linux-foundation.org LKML-Reference: <1264719883-7285-3-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kgdb.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 62bea7307eaa..bfba6019d762 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -239,6 +239,49 @@ static void kgdb_correct_hw_break(void) hw_breakpoint_restore(); } +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responisble for handing the retry on + * remove failure. + */ + return -1; + } + return 0; +} + static int kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { @@ -250,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) if (i == 4) return -1; + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } breakinfo[i].enabled = 0; return 0; @@ -316,6 +363,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) return -1; } breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } breakinfo[i].enabled = 1; return 0; -- cgit v1.2.1 From 681ee44d40d7c93b42118320e4620d07d8704fd6 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 9 Feb 2010 18:01:44 -0800 Subject: x86, apic: Don't use logical-flat mode when CPU hotplug may exceed 8 CPUs We need to fall back from logical-flat APIC mode to physical-flat mode when we have more than 8 CPUs. However, in the presence of CPU hotplug(with bios listing not enabled but possible cpus as disabled cpus in MADT), we have to consider the number of possible CPUs rather than the number of current CPUs; otherwise we may cross the 8-CPU boundary when CPUs are added later. 32bit apic code can use more cleanups (like the removal of vendor checks in 32bit default_setup_apic_routing()) and more unifications with 64bit code. Yinghai has some patches in works already. This patch addresses the boot issue that is reported in the virtualization guest context. [ hpa: incorporated function annotation feedback from Yinghai Lu ] Signed-off-by: Suresh Siddha LKML-Reference: <1265767304.2833.19.camel@sbs-t61.sc.intel.com> Acked-by: Shaohui Zheng Reviewed-by: Yinghai Lu Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/boot.c | 5 ----- arch/x86/kernel/apic/apic.c | 17 ----------------- arch/x86/kernel/apic/probe_32.c | 29 +++++++++++++++++++++++++++-- arch/x86/kernel/apic/probe_64.c | 2 +- arch/x86/kernel/mpparse.c | 7 ------- arch/x86/kernel/smpboot.c | 2 -- 6 files changed, 28 insertions(+), 34 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59d..0acbcdfa5ca4 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1185,9 +1185,6 @@ static void __init acpi_process_madt(void) if (!error) { acpi_lapic = 1; -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif /* * Parse MADT IO-APIC entries */ @@ -1197,8 +1194,6 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - if (apic->setup_apic_routing) - apic->setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3987e4408f75..dfca210f6a10 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1641,9 +1641,7 @@ int __init APIC_init_uniprocessor(void) #endif enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); @@ -1891,21 +1889,6 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; -#ifdef CONFIG_X86_32 - if (num_processors > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1a6559f6768c..99d2fe016084 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,7 +52,32 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void default_setup_apic_routing(void) +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC printk(KERN_INFO @@ -103,7 +128,7 @@ struct apic apic_default = { .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = default_setup_apic_routing, + .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 450fe2064a14..83e9be4778e2 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -67,7 +67,7 @@ void __init default_setup_apic_routing(void) } #endif - if (apic == &apic_flat && num_processors > 8) + if (apic == &apic_flat && num_possible_cpus() > 8) apic = &apic_physflat; printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 40b54ceb68b5..a2c1edd2d3ac 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..b4e870cbdc60 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1083,9 +1083,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_cpu_sibling_map(0); enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); -- cgit v1.2.1 From 1252f238db48ec419f40c1bdf30fda649860eed9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 16 Feb 2010 15:02:13 +0100 Subject: x86: set_personality_ia32() misses force_personality32 05d43ed8a "x86: get rid of the insane TIF_ABI_PENDING bit" forgot about force_personality32. Fix. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 41a26a82470a..126f0b493d04 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -527,6 +527,7 @@ void set_personality_ia32(void) /* Make sure to be in 32bit mode */ set_thread_flag(TIF_IA32); + current->personality |= force_personality32; /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; -- cgit v1.2.1