From e40b17208b6805be50ffe891878662b6076206b9 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:03 -0500 Subject: x86: Move notify_die from nmi.c to traps.c In order to handle a new nmi_watchdog approach, I need to move the notify_die() routine out of nmi_watchdog_tick() and into default_do_nmi(). This lets me easily swap out the old nmi_watchdog with the new one with just a config change. The change probably makes sense from a high level perspective because the nmi_watchdog shouldn't be handling notify_die routines anyway. However, this move does change the semantics a little bit. Instead of checking on every nmi interrupt if the cpus are stuck, only check them on the nmi_watchdog interrupts. v2: Move notify_die call into #idef block Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 7 ------- arch/x86/kernel/traps.c | 5 +++++ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396cb..5d47682f580b 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -400,13 +400,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) int cpu = smp_processor_id(); int rc = 0; - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - sum = get_timer_irqs(cpu); if (__get_cpu_var(nmi_touch)) { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e4454188..51ef893ffa65 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -400,7 +400,12 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + #ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. -- cgit v1.2.1 From 1fb9d6ad2766a1dd70d167552988375049a97f21 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:04 -0500 Subject: nmi_watchdog: Add new, generic implementation, using perf events This is a new generic nmi_watchdog implementation using the perf events infrastructure as suggested by Ingo. The implementation is simple, just create an in-kernel perf event and register an overflow handler to check for cpu lockups. I created a generic implementation that lives in kernel/ and the hardware specific part that for now lives in arch/x86. This approach has a number of advantages: - It simplifies the x86 PMU implementation in the long run, in that it removes the hardcoded low-level PMU implementation that was the NMI watchdog before. - It allows new NMI watchdog features to be added in a central place. - It allows other architectures to enable the NMI watchdog, as long as they have perf events (that provide NMIs) implemented. - It also allows for more graceful co-existence of existing perf events apps and the NMI watchdog - before these changes the relationship was exclusive. (The NMI watchdog will 'spend' a perf event when enabled. In later iterations we might be able to piggyback from an existing NMI event without having to allocate a hardware event for the NMI watchdog - turning this into a no-hardware-cost feature.) As for compatibility, we'll keep the old NMI watchdog code as well until the new one can 100% replace it on all CPUs, old and new alike. That might take some time as the NMI watchdog has been ported to many CPU models. I have done light testing to make sure the framework works correctly and it does. v2: Set the correct timeout values based on the old nmi watchdog Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-3-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 arch/x86/kernel/apic/hw_nmi.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 000000000000..8c0e6a410d05 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,114 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +static DEFINE_PER_CPU(unsigned, last_irq_sum); + +/* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ +static inline unsigned int get_timer_irqs(int cpu) +{ + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; +} + +static inline int mce_in_progress(void) +{ +#if defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; +#endif + return 0; +} + +int hw_nmi_is_cpu_stuck(struct pt_regs *regs) +{ + unsigned int sum; + int cpu = smp_processor_id(); + + /* FIXME: cheap hack for this check, probably should get its own + * die_notifier handler + */ + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + } + + /* if we are doing an mce, just assume the cpu is not stuck */ + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; + + /* We determine if the cpu is stuck by checking whether any + * interrupts have happened since we last checked. Of course + * an nmi storm could create false positives, but the higher + * level logic should account for that + */ + sum = get_timer_irqs(cpu); + if (__get_cpu_var(last_irq_sum) == sum) { + return 1; + } else { + __get_cpu_var(last_irq_sum) = sum; + return 0; + } +} + +void arch_trigger_all_cpu_backtrace(void) +{ + int i; + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + } +} + +/* STUB calls to mimic old nmi_watchdog behaviour */ +unsigned int nmi_watchdog = NMI_NONE; +EXPORT_SYMBOL(nmi_watchdog); +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +EXPORT_SYMBOL(nmi_active); +int nmi_watchdog_enabled; +int unknown_nmi_panic; +void cpu_nmi_set_wd_enabled(void) { return; } +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +void stop_apic_nmi_watchdog(void *unused) { return; } +void setup_apic_nmi_watchdog(void *unused) { return; } +int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.2.1 From 84e478c6f1eb9c4bfa1fff2f8108e9a061b46428 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 5 Feb 2010 21:47:05 -0500 Subject: nmi_watchdog: Config option to enable new nmi_watchdog These are the bits that enable the new nmi_watchdog and safely isolate the old nmi_watchdog. Only one or the other can run, not both at the same time. Signed-off-by: Don Zickus Cc: Linus Torvalds Cc: Andrew Morton Cc: gorcunov@gmail.com Cc: aris@redhat.com Cc: peterz@infradead.org LKML-Reference: <1265424425-31562-4-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 7 ++++++- arch/x86/kernel/traps.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bfc507d..1a4512e48d24 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_NMI_WATCHDOG),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 51ef893ffa65..973cbc4f044f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; +#ifndef CONFIG_NMI_WATCHDOG /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -413,6 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_NMI_WATCHDOG */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.2.1 From 504d7cf10ee42bb76b9556859f23d4121dee0a77 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 12 Feb 2010 17:19:19 -0500 Subject: nmi_watchdog: Compile and portability fixes The original patch was x86_64 centric. Changed the code to make it less so. ested by building and running on a powerpc. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266013161-31197-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c0e6a410d05..312d772c5c35 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -32,8 +32,13 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; + +#if defined(CONFIG_X86_LOCAL_APIC) + irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; +#endif + + return irqs; } static inline int mce_in_progress(void) @@ -82,6 +87,11 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } } +u64 hw_nmi_get_sample_period(void) +{ + return cpu_khz * 1000; +} + void arch_trigger_all_cpu_backtrace(void) { int i; @@ -100,15 +110,16 @@ void arch_trigger_all_cpu_backtrace(void) } /* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) unsigned int nmi_watchdog = NMI_NONE; EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); -int nmi_watchdog_enabled; int unknown_nmi_panic; void cpu_nmi_set_wd_enabled(void) { return; } -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } void stop_apic_nmi_watchdog(void *unused) { return; } void setup_apic_nmi_watchdog(void *unused) { return; } int __init check_nmi_watchdog(void) { return 0; } -- cgit v1.2.1 From 2cc4452bc31fc1cde6f0b64a4eb13269f982787d Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 18 Feb 2010 21:56:52 -0500 Subject: nmi_watchdog: Fix undefined 'apic' build bug Ingo provided me a config that fails to compile with: arch/x86/built-in.o: In function `arch_trigger_all_cpu_backtrace': (.text+0x17e78): undefined reference to `apic' make: *** [.tmp_vmlinux1] Error 1 I realized I changed the compile behaviour of the nmi code by not wrapping it with CONFIG_LOCAL_APIC. To fix this I add a compile check for ARCH_HAS_NMI_WATCHDOG around arch_trigger_all_cpu_backtrace. Signed-off-by: Don Zickus Cc: a.p.zijlstra@chello.nl Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266548212-24243-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 312d772c5c35..0b4d205a6b8e 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -92,6 +92,7 @@ u64 hw_nmi_get_sample_period(void) return cpu_khz * 1000; } +#ifdef ARCH_HAS_NMI_WATCHDOG void arch_trigger_all_cpu_backtrace(void) { int i; @@ -108,6 +109,7 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } +#endif /* STUB calls to mimic old nmi_watchdog behaviour */ #if defined(CONFIG_X86_LOCAL_APIC) -- cgit v1.2.1 From 47195d57636604ff6048b0d7aa3e4ed9643f6073 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Mon, 22 Feb 2010 18:09:03 -0500 Subject: nmi_watchdog: Clean up various small details Mostly copy/paste whitespace damage with a couple of nitpicks by the checkpatch script. Fix the struct definition as requested by Ingo too. Signed-off-by: Don Zickus Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: aris@redhat.com LKML-Reference: <1266880143-24943-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar -- arch/x86/kernel/apic/hw_nmi.c | 14 +++++------ arch/x86/kernel/traps.c | 6 ++-- include/linux/nmi.h | 2 - kernel/nmi_watchdog.c | 51 ++++++++++++++++++++---------------------- 4 files changed, 36 insertions(+), 37 deletions(-) --- arch/x86/kernel/apic/hw_nmi.c | 14 +++++++------- arch/x86/kernel/traps.c | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 0b4d205a6b8e..e8b78a0be5de 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -38,15 +38,15 @@ static inline unsigned int get_timer_irqs(int cpu) irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; #endif - return irqs; + return irqs; } static inline int mce_in_progress(void) { #if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; + return atomic_read(&mce_entry) > 0; #endif - return 0; + return 0; } int hw_nmi_is_cpu_stuck(struct pt_regs *regs) @@ -69,9 +69,9 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) } /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + return 0; /* We determine if the cpu is stuck by checking whether any * interrupts have happened since we last checked. Of course @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return cpu_khz * 1000; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 973cbc4f044f..bdc7fab3ef3e 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -402,9 +402,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; #ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; #ifndef CONFIG_NMI_WATCHDOG /* -- cgit v1.2.1 From 58687acba59266735adb8ccd9b5b9aa2c7cd205b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:44 -0400 Subject: lockup_detector: Combine nmi_watchdog and softlockup detector The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. If the kthread isn't being scheduled neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - seperated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock base hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- arch/x86/kernel/apic/hw_nmi.c | 2 +- arch/x86/kernel/traps.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 1a4512e48d24..52f32e0ea194 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_NMI_WATCHDOG),y) +ifneq ($(CONFIG_LOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o +obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index e8b78a0be5de..79425f96fcee 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) u64 hw_nmi_get_sample_period(void) { - return cpu_khz * 1000; + return (u64)(cpu_khz) * 1000 * 60; } #ifdef ARCH_HAS_NMI_WATCHDOG diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bdc7fab3ef3e..bd347c2b34dc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -406,7 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) == NOTIFY_STOP) return; -#ifndef CONFIG_NMI_WATCHDOG +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -414,7 +414,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_NMI_WATCHDOG */ +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); -- cgit v1.2.1 From 7cbb7e7fa46f6e5229438ac9e4a5c72ec0d53e0b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:48 -0400 Subject: x86: Move trigger_all_cpu_backtrace to its own die_notifier As part of the transition of the nmi watchdog to something more generic, the trigger_all_cpu_backtrace code is getting left behind. Put it in its own die_notifier so it can still be used. V2: - use arch_spin_locks Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-6-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 79425f96fcee..8c3edfb89c2b 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,6 +17,10 @@ #include #include #include +#include +#include +#include + #include #include @@ -54,20 +58,6 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs) unsigned int sum; int cpu = smp_processor_id(); - /* FIXME: cheap hack for this check, probably should get its own - * die_notifier handler - */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ - - spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - } - /* if we are doing an mce, just assume the cpu is not stuck */ /* Could check oops_in_progress here too, but it's safer not to */ if (mce_in_progress()) @@ -109,6 +99,53 @@ void arch_trigger_all_cpu_backtrace(void) mdelay(1); } } + +static int __kprobes +arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + int cpu = smp_processor_id(); + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + + arch_spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + arch_spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static __read_mostly struct notifier_block backtrace_notifier = { + .notifier_call = arch_trigger_all_cpu_backtrace_handler, + .next = NULL, + .priority = 1 +}; + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_die_notifier(&backtrace_notifier); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); #endif /* STUB calls to mimic old nmi_watchdog behaviour */ -- cgit v1.2.1 From 10f9014912a2b1cb59c39cdea777e6d9afa8f17e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 7 May 2010 17:11:49 -0400 Subject: x86: Cleanup hw_nmi.c cruft The design of the hardlockup watchdog has changed and cruft was left behind in the hw_nmi.c file. Just remove the code that isn't used anymore. Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Eric Paris Cc: Randy Dunlap LKML-Reference: <1273266711-18706-7-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/hw_nmi.c | 58 ------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 8c3edfb89c2b..3b40082f0371 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,74 +9,16 @@ * */ -#include -#include #include -#include -#include -#include -#include -#include #include #include #include - - #include #include /* For reliability, we're prepared to waste bits here. */ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static DEFINE_PER_CPU(unsigned, last_irq_sum); - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - unsigned int irqs = per_cpu(irq_stat, cpu).irq0_irqs; - -#if defined(CONFIG_X86_LOCAL_APIC) - irqs += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif - - return irqs; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -int hw_nmi_is_cpu_stuck(struct pt_regs *regs) -{ - unsigned int sum; - int cpu = smp_processor_id(); - - /* if we are doing an mce, just assume the cpu is not stuck */ - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - return 0; - - /* We determine if the cpu is stuck by checking whether any - * interrupts have happened since we last checked. Of course - * an nmi storm could create false positives, but the higher - * level logic should account for that - */ - sum = get_timer_irqs(cpu); - if (__get_cpu_var(last_irq_sum) == sum) { - return 1; - } else { - __get_cpu_var(last_irq_sum) = sum; - return 0; - } -} - u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; -- cgit v1.2.1 From 5e85391b3badd3f0e50ebdd0cafe0202a979f73a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 May 2010 09:12:39 +0200 Subject: x86, watchdog: Fix build error in hw_nmi.c On some configs the following build error triggers: arch/x86/kernel/apic/hw_nmi.c:35: error: 'apic' undeclared (first use in this function) arch/x86/kernel/apic/hw_nmi.c:35: error: (Each undeclared identifier is reported only once arch/x86/kernel/apic/hw_nmi.c:35: error: for each function it appears in.) Because asm/apic.h was only included implicitly. Include it explicitly. Cc: Frederic Weisbecker Cc: Don Zickus Cc: Peter Zijlstra Cc: Cyrill Gorcunov LKML-Reference: <1273713674-8434-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 3b40082f0371..cefd6942f0e9 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -8,6 +8,7 @@ * Bits copied from original nmi.c file * */ +#include #include #include -- cgit v1.2.1 From cafcd80d216bc2136b8edbb794327e495792c666 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Fri, 14 May 2010 11:11:21 -0400 Subject: lockup_detector: Cross arch compile fixes Combining the softlockup and hardlockup code causes watchdog.c to build even without the hardlockup detection support. So if an arch, that has the previous and the new nmi watchdog implementations cohabiting, wants to know if the generic one is in use, CONFIG_LOCKUP_DETECTOR is not a reliable check. We need to use CONFIG_HARDLOCKUP_DETECTOR instead. Fixes: kernel/built-in.o: In function `touch_nmi_watchdog': (.text+0x449bc): multiple definition of `touch_nmi_watchdog' arch/sparc/kernel/built-in.o:(.text+0x11b28): first defined here Signed-off-by: Don Zickus Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Don Zickus Cc: Cyrill Gorcunov LKML-Reference: <20100514151121.GR15159@redhat.com> [ use CONFIG_HARDLOCKUP_DETECTOR instead of CONFIG_PERF_EVENTS_NMI] Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/apic/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52f32e0ea194..910f20b457c4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,10 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_LOCKUP_DETECTOR),y) +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o endif -obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o -- cgit v1.2.1 From 1dedefd1a066a795a87afca9c0236e1a94de9bf6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:23 -0700 Subject: x86: detect scattered cpuid features earlier Some extra CPU features such as ARAT is needed in early boot so that x86_init function pointers can be set up properly. http://lkml.org/lkml/2010/5/18/519 At start_kernel() level, this patch moves init_scattered_cpuid_features() from check_bugs() to setup_arch() -> early_cpu_init() which is earlier than platform specific x86_init layer setup. Suggested by HPA. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-2-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1c00d0b1692..284bf89ddae2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -576,6 +576,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +732,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } -- cgit v1.2.1 From a0c173bd8a3fd0541be8e4ef962170e48d8811c7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:24 -0700 Subject: x86, mrst: add cpu type detection Medfield is the follow-up of Moorestown, it is treated under the same HW sub-architecture. However, we do need to know the CPU type in order for some of the driver to act accordingly. We also have different optimal clock configuration for each CPU type. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/mrst.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index e796448f0eb5..ceaebeb5866f 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -27,6 +27,8 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +static int mrst_cpu_chip; + int sfi_mtimer_num; struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; @@ -216,6 +218,28 @@ static void __init mrst_setup_boot_clock(void) setup_boot_APIC_clock(); }; +int mrst_identify_cpu(void) +{ + return mrst_cpu_chip; +} +EXPORT_SYMBOL_GPL(mrst_identify_cpu); + +void __cpuinit mrst_arch_setup(void) +{ + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -230,6 +254,8 @@ void __init x86_mrst_early_setup(void) x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.oem.arch_setup = mrst_arch_setup; + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; -- cgit v1.2.1 From a875c01944f0d750eeb1ef3133feceb13f13c4b3 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 19 May 2010 12:01:25 -0700 Subject: x86, mrst: add more timer config options Always-on local APIC timer (ARAT) has been introduced to Medfield, along with the platform APB timers we have more timer configuration options between Moorestown and Medfield. This patch adds run-time detection of avaiable timer features so that we can treat Medfield as a variant of Moorestown and set up the optimal timer options for each platform. i.e. Medfield: per cpu always-on local APIC timer Moorestown: per cpu APB timer Manual override is possible via cmdline option x86_mrst_timer. Signed-off-by: Jacob Pan LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Acked-by: Thomas Gleixner Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apb_timer.c | 37 +++++-------------- arch/x86/kernel/mrst.c | 88 ++++++++++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 54 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a35347501d36..8dd77800ff5d 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,10 +43,11 @@ #include #include +#include #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 #define APBT_MIN_DELTA_USEC 200 @@ -83,8 +84,6 @@ struct apbt_dev { char name[10]; }; -int disable_apbt_percpu __cpuinitdata; - static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); #ifdef CONFIG_SMP @@ -194,29 +193,6 @@ static struct clock_event_device apbt_clockevent = { .rating = APBT_CLOCKEVENT_RATING, }; -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - /* * start count down from 0xffff_ffff. this is done by toggling the enable bit * then load initial load count to ~0. @@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", @@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu || !apb_timer_block_enabled) + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); @@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, int timer_num; struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + BUG_ON(!apbt_virt_address); + timer_num = adev->num; pr_debug("%s CPU %d timer %d mode=%d\n", __func__, first_cpu(*evt->cpumask), timer_num, mode); @@ -676,7 +655,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index ceaebeb5866f..636b53bd4198 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,6 +25,29 @@ #include #include +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +int mrst_timer_options __cpuinitdata; + static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; static int mrst_cpu_chip; @@ -169,18 +192,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } -/* - * the secondary clock in Moorestown can be APBT or LAPIC clock, default to - * APBT but cmdline option can also override it. - */ -static void __cpuinit mrst_setup_secondary_clock(void) -{ - /* restore default lapic clock if disabled by cmdline */ - if (disable_apbt_percpu) - return setup_secondary_APIC_clock(); - apbt_setup_secondary_clock(); -} - static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; @@ -197,6 +208,21 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); @@ -207,17 +233,6 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -/* - * if we use per cpu apb timer, the bootclock already setup. if we use lapic - * timer and one apbt timer for broadcast, we need to set up lapic boot clock. - */ -static void __init mrst_setup_boot_clock(void) -{ - pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); - if (disable_apbt_percpu) - setup_boot_APIC_clock(); -}; - int mrst_identify_cpu(void) { return mrst_cpu_chip; @@ -250,13 +265,13 @@ void __init x86_mrst_early_setup(void) x86_init.resources.reserve_resources = x86_init_noop; x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.oem.arch_setup = mrst_arch_setup; - x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_init.pci.init = pci_mrst_init; @@ -269,3 +284,26 @@ void __init x86_mrst_early_setup(void) x86_init.mpparse.get_smp_config = x86_init_uint_noop; } + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); -- cgit v1.2.1 From a75af580bb1fd261bf63cc00e4b324e17ceb15cf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 13:40:14 -0700 Subject: x86, mrst: make mrst_identify_cpu() an inline returning enum We have an enum, might as well use it. While we're at it, make it an inline... there is really no point in calling a function for this stuff. LKML-Reference: <1274295685-6774-3-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin Acked-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/kernel/mrst.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 636b53bd4198..967f2686adb0 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -50,7 +50,8 @@ int mrst_timer_options __cpuinitdata; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -static int mrst_cpu_chip; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); int sfi_mtimer_num; @@ -233,25 +234,19 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -int mrst_identify_cpu(void) -{ - return mrst_cpu_chip; -} -EXPORT_SYMBOL_GPL(mrst_identify_cpu); - void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; else { pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", boot_cpu_data.x86, boot_cpu_data.x86_model); - mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; } pr_debug("Moorestown CPU %s identified\n", - (mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? "Lincroft" : "Penwell"); } -- cgit v1.2.1 From 14671386dcbafb3086bbda3cb6f9f27d34c7bf6d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 19 May 2010 14:37:40 -0700 Subject: x86, mrst: make mrst_timer_options an enum We have an enum mrst_timer_options, use it so that the kernel knows if we're missing something from a switch statement or equivalent. Signed-off-by: H. Peter Anvin LKML-Reference: <1274295685-6774-4-git-send-email-jacob.jun.pan@linux.intel.com> Cc: Thomas Gleixner Cc: Jacob Pan --- arch/x86/kernel/mrst.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 967f2686adb0..7ee4ed901baf 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -46,7 +46,7 @@ * lapic (always-on,ARAT) ------ 150 */ -int mrst_timer_options __cpuinitdata; +__cpuinitdata enum mrst_timer_options mrst_timer_options; static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -- cgit v1.2.1 From 12a6611fa16e9c6d2f844fe2175d219c6e9bd95d Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: Calculate BAU destination timeout Calculate the Broadcast Assist Unit's destination timeout period from the values in the relevant MMR's. Store it in each cpu's per-cpu BAU structure so that a destination timeout can be differentiated from a 'plugged' situation in which all software ack resources are already allocated and a timeout is pending. That case returns an immediate destination error. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 51 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7fea555929e2..5506836c4a82 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -30,6 +30,19 @@ struct msg_desc { struct bau_payload_queue_entry *va_queue_last; }; +/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ +static int timeout_base_ns[] = { + 20, + 160, + 1280, + 10240, + 81920, + 655360, + 5242880, + 167772160 +}; +static int timeout_us; + #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL static int uv_bau_max_concurrent __read_mostly; @@ -423,7 +436,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * pending. In that case hardware returns the * ERROR that looks like a destination timeout. */ - if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { + if (cycles_2_us(ttime - bcp->send_message) < + timeout_us) { bcp->conseccompletes = 0; return FLUSH_RETRY_PLUGGED; } @@ -908,12 +922,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) } static inline unsigned long long -millisec_2_cycles(unsigned long millisec) +microsec_2_cycles(unsigned long microsec) { unsigned long ns; unsigned long long cyc; - ns = millisec * 1000; + ns = microsec * 1000; cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); return cyc; } @@ -1258,6 +1272,33 @@ static void __init uv_init_uvhub(int uvhub, int vector) ((apicid << 32) | vector)); } +/* + * We will set BAU_MISC_CONTROL with a timeout period. + * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. + * So the destination timeout period has be be calculated from them. + */ +static int +calculate_destination_timeout(void) +{ + unsigned long mmr_image; + int mult1; + int mult2; + int index; + int base; + int ret; + unsigned long ts_ns; + + mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + return ret; +} + /* * initialize the bau_control structure for each cpu */ @@ -1286,6 +1327,8 @@ static void uv_init_per_cpu(int nuvhubs) }; struct uvhub_desc *uvhub_descs; + timeout_us = calculate_destination_timeout(); + uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); @@ -1301,7 +1344,7 @@ static void uv_init_per_cpu(int nuvhubs) bdp->uvhub = uvhub; bdp->pnode = pnode; /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = millisec_2_cycles(3); + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) -- cgit v1.2.1 From e8e5e8a8048006a12d7777a93baebd6e39496101 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: BAU tunables into a debugfs file Make the Broadcast Assist Unit driver's nine tuning values variable by making them accessible through a read/write debugfs file. The file will normally be mounted as /sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each cpu's per-cpu BAU structure. The patch also does a little name improvement, and corrects the reset of two destination timeout counters. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 281 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 241 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 5506836c4a82..c8661779c51e 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include @@ -42,12 +43,22 @@ static int timeout_base_ns[] = { 167772160 }; static int timeout_us; +static int nobau; -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL - -static int uv_bau_max_concurrent __read_mostly; +/* tunables: */ +static int max_bau_concurrent = MAX_BAU_CONCURRENT; +static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_response_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; +static struct dentry *tunables_dir; +static struct dentry *tunables_file; -static int nobau; static int __init setup_nobau(char *arg) { nobau = 1; @@ -539,23 +550,24 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, unsigned long index; cycles_t time1; cycles_t time2; + cycles_t elapsed; struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; /* - * Spin here while there are hmaster->max_concurrent or more active + * Spin here while there are hmaster->max_bau_concurrent or more active * descriptors. This is the per-uvhub 'throttle'. */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)) { + hmaster->max_bau_concurrent)) { stat->s_throttles++; do { cpu_relax(); } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)); + hmaster->max_bau_concurrent)); } while (hmaster->uvhub_quiesce) @@ -609,9 +621,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, * that case hardware immediately returns the ERROR * that looks like a destination timeout. */ - udelay(TIMEOUT_DELAY); + udelay(bcp->plugged_delay); bcp->plugged_tries++; - if (bcp->plugged_tries >= PLUGSB4RESET) { + if (bcp->plugged_tries >= bcp->plugsb4reset) { bcp->plugged_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -623,10 +635,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_plug++; } } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_concurrent = 1; + hmaster->max_bau_concurrent = 1; bcp->timeout_tries++; udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= TIMEOUTSB4RESET) { + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { bcp->timeout_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -638,7 +650,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_timeout++; } } - if (bcp->ipi_attempts >= 3) { + if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; completion_status = FLUSH_GIVEUP; break; @@ -648,9 +660,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) - && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) - hmaster->max_concurrent++; + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + + if ((completion_status == FLUSH_COMPLETE) && + (bcp->conseccompletes > bcp->complete_threshold) && + (hmaster->max_bau_concurrent < + hmaster->max_bau_concurrent_constant)) + hmaster->max_bau_concurrent++; /* * hold any cpu not timing out here; no other cpu currently held by @@ -661,9 +678,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, atomic_dec(&hmaster->active_descriptor_count); /* guard against cycles wrap */ - if (time2 > time1) - stat->s_time += (time2 - time1); - else + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; @@ -730,10 +748,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct ptc_stats *stat; struct bau_control *bcp; + /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + /* * Each sending cpu has a per-cpu mask which it fills from the caller's * cpu mask. Only remote cpus are converted to uvhubs and copied. @@ -970,6 +990,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, stat->s_busy, stat->s_throttles); + /* destination side statistics */ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", @@ -985,10 +1006,29 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) return 0; } +/* + * Display the tunables thru debugfs + */ +static ssize_t tunables_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[300]; + int ret; + + ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + "max_bau_concurrent plugged_delay plugsb4reset", + "timeoutsb4reset ipi_reset_limit complete_threshold", + "congested_response_us congested_reps congested_period", + max_bau_concurrent, plugged_delay, plugsb4reset, + timeoutsb4reset, ipi_reset_limit, complete_threshold, + congested_response_us, congested_reps, congested_period); + + return simple_read_from_buffer(userbuf, count, ppos, buf, ret); +} + /* * -1: resetf the statistics * 0: display meaning of the statistics - * >0: maximum concurrent active descriptors per uvhub (throttle) */ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data) @@ -997,7 +1037,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, long input_arg; char optstr[64]; struct ptc_stats *stat; - struct bau_control *bcp; if (count == 0 || count > sizeof(optstr)) return -EINVAL; @@ -1078,24 +1117,149 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, stat = &per_cpu(ptcstats, cpu); memset(stat, 0, sizeof(struct ptc_stats)); } - } else { - uv_bau_max_concurrent = input_arg; - bcp = &per_cpu(bau_control, smp_processor_id()); - if (uv_bau_max_concurrent < 1 || - uv_bau_max_concurrent > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d; %d is invalid\n", - bcp->max_concurrent, uv_bau_max_concurrent); - return -EINVAL; - } - printk(KERN_DEBUG "Set BAU max concurrent:%d\n", - uv_bau_max_concurrent); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_concurrent = uv_bau_max_concurrent; + } + + return count; +} + +static int local_atoi(const char *name) +{ + int val = 0; + + for (;; name++) { + switch (*name) { + case '0' ... '9': + val = 10*val+(*name-'0'); + break; + default: + return val; } } +} + +/* + * set the tunables + * 0 values reset them to defaults + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int cnt = 0; + int val; + char *p; + char *q; + char instr[64]; + struct bau_control *bcp; + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + /* count the fields */ + p = instr + strspn(instr, WHITESPACE); + q = p; + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + cnt++; + if (q == p) + break; + } + if (cnt != 9) { + printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); + return -EINVAL; + } + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { + q = p + strcspn(p, WHITESPACE); + val = local_atoi(p); + switch (cnt) { + case 0: + if (val == 0) { + max_bau_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent_constant = + MAX_BAU_CONCURRENT; + continue; + } + bcp = &per_cpu(bau_control, smp_processor_id()); + if (val < 1 || val > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d is invalid\n", + val); + return -EINVAL; + } + max_bau_concurrent = val; + max_bau_concurrent_constant = val; + continue; + case 1: + if (val == 0) + plugged_delay = PLUGGED_DELAY; + else + plugged_delay = val; + continue; + case 2: + if (val == 0) + plugsb4reset = PLUGSB4RESET; + else + plugsb4reset = val; + continue; + case 3: + if (val == 0) + timeoutsb4reset = TIMEOUTSB4RESET; + else + timeoutsb4reset = val; + continue; + case 4: + if (val == 0) + ipi_reset_limit = IPI_RESET_LIMIT; + else + ipi_reset_limit = val; + continue; + case 5: + if (val == 0) + complete_threshold = COMPLETE_THRESHOLD; + else + complete_threshold = val; + continue; + case 6: + if (val == 0) + congested_response_us = CONGESTED_RESPONSE_US; + else + congested_response_us = val; + continue; + case 7: + if (val == 0) + congested_reps = CONGESTED_REPS; + else + congested_reps = val; + continue; + case 8: + if (val == 0) + congested_period = CONGESTED_PERIOD; + else + congested_period = val; + continue; + } + if (q == p) + break; + } + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } return count; } @@ -1111,6 +1275,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file) return seq_open(file, &uv_ptc_seq_ops); } +static int tunables_open(struct inode *inode, struct file *file) +{ + return 0; +} + static const struct file_operations proc_uv_ptc_operations = { .open = uv_ptc_proc_open, .read = seq_read, @@ -1119,6 +1288,12 @@ static const struct file_operations proc_uv_ptc_operations = { .release = seq_release, }; +static const struct file_operations tunables_fops = { + .open = tunables_open, + .read = tunables_read, + .write = tunables_write, +}; + static int __init uv_ptc_init(void) { struct proc_dir_entry *proc_uv_ptc; @@ -1133,6 +1308,20 @@ static int __init uv_ptc_init(void) UV_PTC_BASENAME); return -EINVAL; } + + tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); + if (!tunables_dir) { + printk(KERN_ERR "unable to create debugfs directory %s\n", + UV_BAU_TUNABLES_DIR); + return -EINVAL; + } + tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, + tunables_dir, NULL, &tunables_fops); + if (!tunables_file) { + printk(KERN_ERR "unable to create debugfs file %s\n", + UV_BAU_TUNABLES_FILE); + return -EINVAL; + } return 0; } @@ -1336,15 +1525,12 @@ static void uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); spin_lock_init(&bcp->masks_lock); - bcp->max_concurrent = uv_bau_max_concurrent; pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) @@ -1380,6 +1566,21 @@ static void uv_init_per_cpu(int nuvhubs) } } kfree(uvhub_descs); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } } /* @@ -1404,7 +1605,7 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - uv_bau_max_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); @@ -1437,4 +1638,4 @@ static int __init uv_bau_init(void) return 0; } core_initcall(uv_bau_init); -core_initcall(uv_ptc_init); +fs_initcall(uv_ptc_init); -- cgit v1.2.1 From 50fb55acc5bbe5ee29d0a65262f4ec286b14d156 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Disable BAU on network congestion The numalink network can become so congested that TLB shootdown using the Broadcast Assist Unit becomes slower than using IPI's. In that case, disable the use of the BAU for a period of time. The period is tunable. When the period expires the use of the BAU is re-enabled. A count of these actions is added to the statistics file. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index c8661779c51e..dc6a68312758 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -44,6 +44,9 @@ static int timeout_base_ns[] = { }; static int timeout_us; static int nobau; +static int baudisabled; +static spinlock_t disable_lock; +static cycles_t congested_cycles; /* tunables: */ static int max_bau_concurrent = MAX_BAU_CONCURRENT; @@ -519,6 +522,35 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Completions are taking a very long time due to a congested numalink + * network. + */ +static void +disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + /* let only one cpu do this disabling */ + spin_lock(&disable_lock); + if (!baudisabled && bcp->period_requests && + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + /* it becomes this cpu's job to turn on the use of the + BAU again */ + baudisabled = 1; + bcp->set_bau_off = 1; + bcp->set_bau_on_time = get_cycles() + + sec_2_cycles(bcp->congested_period); + stat->s_bau_disabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 1; + } + } + spin_unlock(&disable_lock); +} + /** * uv_flush_send_and_wait * @@ -681,6 +713,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->congested_reps)) { + disable_for_congestion(bcp, stat); + } + } } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) @@ -747,12 +787,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + struct bau_control *tbcp; /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + stat = &per_cpu(ptcstats, cpu); + + /* bau was disabled due to slow response */ + if (bcp->baudisabled) { + /* the cpu that disabled it must re-enable it */ + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; + } + } + } + return cpumask; + } /* * Each sending cpu has a per-cpu mask which it fills from the caller's @@ -793,7 +853,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, else return NULL; } - stat = &per_cpu(ptcstats, cpu); stat->s_requestor++; stat->s_ntargcpu += remotes; remotes = bau_uvhub_weight(&bau_desc->distribution); @@ -973,7 +1032,9 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "sw_ack recv rtime all "); seq_printf(file, - "one mult none retry canc nocan reset rcan\n"); + "one mult none retry canc nocan reset rcan "); + seq_printf(file, + "disable enable\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -993,7 +1054,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", uv_read_global_mmr64(uv_cpu_to_pnode(cpu), UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), stat->d_requestee, cycles_2_us(stat->d_time), @@ -1001,6 +1062,8 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); + seq_printf(file, "%ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled); } return 0; @@ -1112,6 +1175,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, "reset: number of ipi-style reset requests processed\n"); printk(KERN_DEBUG "rcan: number messages canceled by reset requests\n"); + printk(KERN_DEBUG + "disable: number times use of the BAU was disabled\n"); + printk(KERN_DEBUG + "enable: number times use of the BAU was re-enabled\n"); } else if (input_arg == -1) { for_each_present_cpu(cpu) { stat = &per_cpu(ptcstats, cpu); @@ -1568,6 +1635,7 @@ static void uv_init_per_cpu(int nuvhubs) kfree(uvhub_descs); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1609,6 +1677,8 @@ static int __init uv_bau_init(void) uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); + spin_lock_init(&disable_lock); + congested_cycles = microsec_2_cycles(congested_response_us); uv_init_per_cpu(nuvhubs); -- cgit v1.2.1 From 712157aa703a01f58c7c17452096ab00b774d0a9 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Shorten access to BAU statistics structure Use a pointer from the per-cpu BAU control structure to the per-cpu BAU statistics structure. We nearly always know the first before needing the second. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc6a68312758..261b9653cde5 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -153,7 +153,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, struct ptc_stats *stat; msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_retries++; /* * cancel any message from msg+1 to the retry itself @@ -217,7 +217,7 @@ static void uv_bau_process_message(struct msg_desc *mdp, * This must be a normal message, or retry of a normal message */ msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); stat->d_alltlb++; @@ -301,7 +301,7 @@ uv_do_reset(void *ptr) bcp = &per_cpu(bau_control, smp_processor_id()); rap = (struct reset_args *)ptr; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_resets++; /* @@ -419,7 +419,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mask; cycles_t ttime; cycles_t timeout_time; - struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; @@ -583,7 +583,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, cycles_t time1; cycles_t time2; cycles_t elapsed; - struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; @@ -794,7 +794,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, return cpumask; bcp = &per_cpu(bau_control, cpu); - stat = &per_cpu(ptcstats, cpu); + stat = bcp->statp; /* bau was disabled due to slow response */ if (bcp->baudisabled) { @@ -903,7 +903,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); - stat = &per_cpu(ptcstats, smp_processor_id()); + stat = bcp->statp; msgdesc.va_queue_first = bcp->va_queue_first; msgdesc.va_queue_last = bcp->va_queue_last; msg = bcp->bau_msg_head; @@ -1636,6 +1636,7 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1673,7 +1674,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); -- cgit v1.2.1 From 4faca1550838708d71f6eea14cdacb0876c3a5a4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU structure rearranging Move some structure definitions from the C code to the BAU header file, and change the organization of that header file a little. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 261b9653cde5..d75929039846 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,14 +23,6 @@ #include #include -struct msg_desc { - struct bau_payload_queue_entry *msg; - int msg_slot; - int sw_ack_slot; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; -}; - /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { 20, @@ -79,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); -struct reset_args { - int sender; -}; - /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. -- cgit v1.2.1 From 39847e7f3c8198b14102fe7ba4b3a6a1d84bbcfe Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU software acknowledge Correct the acknowledgment and the reset of a BAU software-acknowledged message. A retry message should be testing only for timed-out resources (mask << 8). (And we delete a log message that might cause unnecessary concern) The acknowledge MMR is |--timed-out--|---pending--|, each is 8 bits. The IPI-driven reset of software acknowledge resources frees both timed out and pending resources. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d75929039846..295a41122da1 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -161,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, slot2 = msg2 - mdp->va_queue_first; mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg2->sw_ack_vector << 8) | - msg2->sw_ack_vector); + msg_res = msg2->sw_ack_vector; /* * This is a message retry; clear the resources held * by the previous message only if they timed out. * If it has not timed out we have an unexpected * situation to report. */ - if (mmr & (msg_res << 8)) { + if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { /* * is the resource timed out? * make everyone ignore the cancelled message. @@ -179,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, cancel_count++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << 8) | msg_res); - } else - printk(KERN_INFO "note bau retry: no effect\n"); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); + } } } if (!cancel_count) @@ -317,13 +316,13 @@ uv_do_reset(void *ptr) */ mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg->sw_ack_vector << 8) | - msg->sw_ack_vector); + msg_res = msg->sw_ack_vector; if (mmr & msg_res) { stat->d_rcanceled++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - msg_res); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); } } } -- cgit v1.2.1 From a8328ee58c15c9d763a67607a35bb987b38950fa Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU discovery of hubs and sockets Correct the initialization-time assumption of contigous blade numbers and of sockets numbered from zero. There may be hubs present with no cpu's enabled. There may be disabled sockets such that the active socket is not number zero. And assign a 'socket master' by assuming that a socket is a node. (it is not safe to extract socket number from an apicid) Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 49 ++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 295a41122da1..ab929e976502 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1547,11 +1547,13 @@ calculate_destination_timeout(void) */ static void uv_init_per_cpu(int nuvhubs) { - int i, j, k; + int i; int cpu; int pnode; int uvhub; short socket = 0; + unsigned short socket_mask; + unsigned int uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1562,7 +1564,7 @@ static void uv_init_per_cpu(int nuvhubs) short cpu_number[16]; }; struct uvhub_desc { - short num_sockets; + unsigned short socket_mask; short num_cpus; short uvhub; short pnode; @@ -1581,43 +1583,54 @@ static void uv_init_per_cpu(int nuvhubs) spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + uvhub_mask |= (1 << uvhub); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* kludge: assume uv_hub.h is constant */ - socket = (cpu_physical_id(cpu)>>5)&1; - if (socket >= bdp->num_sockets) - bdp->num_sockets = socket+1; + /* kludge: 'assuming' one node per socket, and assuming that + disabling a socket just leaves a gap in node numbers */ + socket = (cpu_to_node(cpu) & 1);; + bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - socket = 0; - for_each_possible_blade(uvhub) { + uvhub = 0; + while (uvhub_mask) { + if (!(uvhub_mask & 1)) + goto nexthub; bdp = &uvhub_descs[uvhub]; - for (i = 0; i < bdp->num_sockets; i++) { - sdp = &bdp->socket[i]; - for (j = 0; j < sdp->num_cpus; j++) { - cpu = sdp->cpu_number[j]; + socket_mask = bdp->socket_mask; + socket = 0; + while (socket_mask) { + if (!(socket_mask & 1)) + goto nextsocket; + sdp = &bdp->socket[socket]; + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; bcp = &per_cpu(bau_control, cpu); bcp->cpu = cpu; - if (j == 0) { + if (i == 0) { smaster = bcp; - if (i == 0) + if (socket == 0) hmaster = bcp; } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = smaster; + bcp->uvhub = bdp->uvhub; bcp->uvhub_master = hmaster; - for (k = 0; k < DEST_Q_SIZE; k++) - bcp->socket_acknowledge_count[k] = 0; - bcp->uvhub_cpu = - uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> + blade_processor_id; } +nextsocket: socket++; + socket_mask = (socket_mask >> 1); } +nexthub: + uvhub++; + uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); for_each_present_cpu(cpu) { -- cgit v1.2.1 From 90cc7d944981a6d06b49bb26fde1b490e28c90e5 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Remove BAU check for stay-busy Remove a faulty assumption that a long running BAU request has encountered a hardware problem and will never finish. Numalink congestion can make a request appear to have encountered such a problem, but it is not safe to cancel the request. If such a cancel is done but a reply is later received we can miss a TLB shootdown. We depend upon the max_bau_concurrent 'throttle' to prevent the stay-busy case from happening. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ab929e976502..dc962b5ac870 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -405,12 +405,10 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr; unsigned long mask; cycles_t ttime; - cycles_t timeout_time; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; - timeout_time = get_cycles() + bcp->timeout_interval; /* spin on the status MMR, waiting for it to go idle */ while ((descriptor_status = (((unsigned long) @@ -450,26 +448,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * descriptor_status is still BUSY */ cpu_relax(); - relaxes++; - if (relaxes >= 10000) { - relaxes = 0; - if (get_cycles() > timeout_time) { - quiesce_local_uvhub(hmaster); - - /* single-thread the register change */ - spin_lock(&hmaster->masks_lock); - mmr = uv_read_local_mmr(mmr_offset); - mask = 0UL; - mask |= (3UL < right_shift); - mask = ~mask; - mmr &= mask; - uv_write_local_mmr(mmr_offset, mmr); - spin_unlock(&hmaster->masks_lock); - end_uvhub_quiesce(hmaster); - stat->s_busy++; - return FLUSH_GIVEUP; - } - } } } bcp->conseccompletes++; @@ -1580,7 +1558,6 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); - spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; uvhub_mask |= (1 << uvhub); -- cgit v1.2.1 From 7fba1bcd4844a4a8619a03bf51cabc92aea365a8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU regular message type The Broadcast Assist Unit messages have a regular or retry message type. The regular type was not being set, but needs to be, because the lack of a message type is sometimes used to identify an unused entry in the message queue. Also removing some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc962b5ac870..4cb14dbd7fa3 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -580,23 +580,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } time1 = get_cycles(); do { - /* - * Every message from any given cpu gets a unique message - * sequence number. But retries use that same number. - * Our message may have timed out at the destination because - * all sw-ack resources are in use and there is a timeout - * pending there. In that case, our last send never got - * placed into the queue and we need to persist until it - * does. - * - * Make any retry a type MSG_RETRY so that the destination will - * free any resource held by a previous message from this cpu. - */ if (try == 0) { - /* use message type set by the caller the first time */ + bau_desc->header.msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - /* use RETRY type on all the rest; same sequence */ bau_desc->header.msg_type = MSG_RETRY; stat->s_retry_messages++; } -- cgit v1.2.1 From 450a007eebaf430426ea8f89bbc3f287949905b2 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU broadcast to the local hub Make the Broadcast Assist Unit driver use the BAU for TLB shootdowns of cpu's on the local uvhub. It was previously thought that IPI might be faster to the cpu's on the local hub. But the IPI operation would have to follow the completion of the BAU broadcast anyway. So we broadcast to the local uvhub in all cases except when the current cpu was the only local cpu in the mask. This simplifies uv_flush_send_and_wait() in that it returns either all shootdowns complete, or none. Adjust the statistics to account for shootdowns on the local uvhub. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 138 ++++++++++++++++++----------------------------- 1 file changed, 53 insertions(+), 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 4cb14dbd7fa3..a1615058fad3 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -400,10 +400,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, int this_cpu, struct bau_control *bcp, struct bau_control *smaster, long try) { - int relaxes = 0; unsigned long descriptor_status; - unsigned long mmr; - unsigned long mask; cycles_t ttime; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; @@ -524,25 +521,19 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * The flush_mask contains the cpus the broadcast is to be sent to, plus * cpus that are on the local uvhub. * - * Returns NULL if all flushing represented in the mask was done. The mask - * is zeroed. - * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set, representing any cpus on the local - * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. + * Returns 0 if all flushing represented in the mask was done. + * Returns 1 if it gives up entirely and the original cpu mask is to be + * returned to the kernel. */ -const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, - struct bau_control *bcp) +int uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, struct bau_control *bcp) { int right_shift; - int uvhub; - int bit; int completion_status = 0; int seq_number = 0; long try = 0; int cpu = bcp->uvhub_cpu; int this_cpu = bcp->cpu; - int this_uvhub = bcp->uvhub; unsigned long mmr_offset; unsigned long index; cycles_t time1; @@ -552,10 +543,6 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; - /* - * Spin here while there are hmaster->max_bau_concurrent or more active - * descriptors. This is the per-uvhub 'throttle'. - */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)) { @@ -591,9 +578,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; completion_status = uv_wait_completion(bau_desc, mmr_offset, right_shift, this_cpu, bcp, smaster, try); @@ -652,16 +637,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (hmaster->max_bau_concurrent < hmaster->max_bau_concurrent_constant)) hmaster->max_bau_concurrent++; - - /* - * hold any cpu not timing out here; no other cpu currently held by - * the 'throttle' should enter the activation code - */ while (hmaster->uvhub_quiesce) cpu_relax(); atomic_dec(&hmaster->active_descriptor_count); - - /* guard against cycles wrap */ if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; @@ -674,32 +652,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } } } else - stat->s_requestor--; /* don't count this one */ + stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; else if (completion_status == FLUSH_GIVEUP) { - /* - * Cause the caller to do an IPI-style TLB shootdown on - * the target cpu's, all of which are still in the mask. - */ stat->s_giveup++; - return flush_mask; + return 1; } - - /* - * Success, so clear the remote cpu's from the mask so we don't - * use the IPI method of shootdown on them. - */ - for_each_cpu(bit, flush_mask) { - uvhub = uv_cpu_to_blade_id(bit); - if (uvhub == this_uvhub) - continue; - cpumask_clear_cpu(bit, flush_mask); - } - if (!cpumask_empty(flush_mask)) - return flush_mask; - - return NULL; + return 0; } /** @@ -731,10 +691,11 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int remotes; int tcpu; int uvhub; int locals = 0; + int remotes = 0; + int hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; @@ -768,54 +729,52 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. Only remote cpus are converted to uvhubs and copied. + * cpu mask. All cpus are converted to uvhubs and copied to the + * activation descriptor. */ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* - * copy cpumask to flush_mask, removing current cpu - * (current cpu should already have been flushed by the caller and - * should never be returned if we return flush_mask) - */ + /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); if (cpu_isset(cpu, *cpumask)) - locals++; /* current cpu was targeted */ + stat->s_ntargself++; bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - remotes = 0; + + /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { uvhub = uv_cpu_to_blade_id(tcpu); - if (uvhub == bcp->uvhub) { - locals++; - continue; - } bau_uvhub_set(uvhub, &bau_desc->distribution); - remotes++; - } - if (remotes == 0) { - /* - * No off_hub flushing; return status for local hub. - * Return the caller's mask if all were local (the current - * cpu may be in that mask). - */ - if (locals) - return cpumask; + if (uvhub == bcp->uvhub) + locals++; else - return NULL; + remotes++; } + if ((locals + remotes) == 0) + return NULL; stat->s_requestor++; - stat->s_ntargcpu += remotes; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; remotes = bau_uvhub_weight(&bau_desc->distribution); - stat->s_ntarguvhub += remotes; - if (remotes >= 16) + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); + } else + stat->s_ntargremoteuvhub += hubs; + stat->s_ntarguvhub += hubs; + if (hubs >= 16) stat->s_ntarguvhub16++; - else if (remotes >= 8) + else if (hubs >= 8) stat->s_ntarguvhub8++; - else if (remotes >= 4) + else if (hubs >= 4) stat->s_ntarguvhub4++; - else if (remotes >= 2) + else if (hubs >= 2) stat->s_ntarguvhub2++; else stat->s_ntarguvhub1++; @@ -824,10 +783,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc->payload.sending_cpu = cpu; /* - * uv_flush_send_and_wait returns null if all cpu's were messaged, or - * the adjusted flush_mask if any cpu's were not messaged. + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, + * or 1 if it gave up and the original cpumask should be returned. */ - return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); + if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + return NULL; + else + return cpumask; } /* @@ -976,9 +938,11 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) if (!cpu) { seq_printf(file, - "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); + "# cpu sent stime self locals remotes ncpus localhub "); + seq_printf(file, + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto "); seq_printf(file, "retries rok resetp resett giveup sto bz throt "); seq_printf(file, @@ -994,10 +958,14 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntarguvhub, stat->s_ntarguvhub16, + stat->s_ntargself, stat->s_ntarglocals, + stat->s_ntargremotes, stat->s_ntargcpu, + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, + stat->s_ntarguvhub, stat->s_ntarguvhub16); + seq_printf(file, "%ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_ntargcpu, stat->s_dtimeout); + stat->s_dtimeout); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, -- cgit v1.2.1 From f6d8a56693426b1f29ff5cafda8be0d65e4e1870 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Modularize BAU send and wait Streamline the large uv_flush_send_and_wait() function by use of a couple of helper functions. And remove some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 82 ++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 38 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index a1615058fad3..abf3c31f14cf 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -484,6 +484,47 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Our retries are blocked by all destination swack resources being + * in use, and a timeout is pending. In that case hardware immediately + * returns the ERROR that looks like a destination timeout. + */ +static void +destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + udelay(bcp->plugged_delay); + bcp->plugged_tries++; + if (bcp->plugged_tries >= bcp->plugsb4reset) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } +} + +static void +destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + hmaster->max_bau_concurrent = 1; + bcp->timeout_tries++; + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } +} + /* * Completions are taking a very long time due to a congested numalink * network. @@ -518,7 +559,7 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * * Send a broadcast and wait for it to complete. * - * The flush_mask contains the cpus the broadcast is to be sent to, plus + * The flush_mask contains the cpus the broadcast is to be sent to including * cpus that are on the local uvhub. * * Returns 0 if all flushing represented in the mask was done. @@ -553,7 +594,6 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)); } - while (hmaster->uvhub_quiesce) cpu_relax(); @@ -584,40 +624,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, right_shift, this_cpu, bcp, smaster, try); if (completion_status == FLUSH_RETRY_PLUGGED) { - /* - * Our retries may be blocked by all destination swack - * resources being consumed, and a timeout pending. In - * that case hardware immediately returns the ERROR - * that looks like a destination timeout. - */ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } + destination_plugged(bau_desc, bcp, hmaster, stat); } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } + destination_timeout(bau_desc, bcp, hmaster, stat); } if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; @@ -628,10 +637,8 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } while ((completion_status == FLUSH_RETRY_PLUGGED) || (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - bcp->plugged_tries = 0; bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > bcp->complete_threshold) && (hmaster->max_bau_concurrent < @@ -740,7 +747,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); /* cpu statistics */ -- cgit v1.2.1 From c9cf4dbb4d9ca715d8fedf13301a53296429abc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 May 2010 21:35:17 +0200 Subject: x86: Unify dumpstack.h and stacktrace.h arch/x86/include/asm/stacktrace.h and arch/x86/kernel/dumpstack.h declare headers of objects that deal with the same topic. Actually most of the files that include stacktrace.h also include dumpstack.h Although dumpstack.h seems more reserved for internals of stack traces, those are quite often needed to define specialized stack trace operations. And perf event arch headers are going to need access to such low level operations anyway. So don't continue to bother with dumpstack.h as it's not anymore about isolated deep internals. v2: fix struct stack_frame definition conflict in sysprof Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Soeren Sandmann --- arch/x86/kernel/cpu/perf_event.c | 2 -- arch/x86/kernel/dumpstack.c | 1 - arch/x86/kernel/dumpstack.h | 56 ---------------------------------------- arch/x86/kernel/dumpstack_32.c | 2 -- arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/stacktrace.c | 7 ++--- 6 files changed, 4 insertions(+), 65 deletions(-) delete mode 100644 arch/x86/kernel/dumpstack.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c77586061bcb..9632fb61e8f9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1585,8 +1585,6 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c89a386930b7..6e8752c1bd52 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,7 +18,6 @@ #include -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd44..000000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -#include - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); - -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); - -extern unsigned int code_bytes; - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -struct stack_frame_ia32 { - u32 next_frame; - u32 return_address; -}; - -static inline unsigned long rewind_frame_pointer(int n) -{ - struct stack_frame *frame; - - get_bp(frame); - -#ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } -#endif - - return (unsigned long)frame; -} - -#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d93..0f6376ffa2d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,6 @@ #include -#include "dumpstack.h" - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f3..57a21f11c791 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,7 +16,6 @@ #include -#include "dumpstack.h" #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6c..ea54d029fe27 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; -- cgit v1.2.1 From b0f82b81fe6bbcf78d478071f33e44554726bc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 20 May 2010 07:47:21 +0200 Subject: perf: Drop the skip argument from perf_arch_fetch_regs_caller Drop this argument now that we always want to rewind only to the state of the first caller. It means frame pointers are not necessary anymore to reliably get the source of an event. But this also means we need this helper to be a macro now, as an inline function is not an option since we need to know when to provide a default implentation. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul Mackerras Cc: David Miller Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9632fb61e8f9..2c075fe573d0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1706,22 +1706,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - /* - * We abuse bit 3 to pass exact information, see perf_misc_flags - * and the comment with PERF_EFLAGS_EXACT. - */ - regs->flags = 0; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; -- cgit v1.2.1 From 8d2cacbbb8deadfae78aa16e4e1ee619bdd7019e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 17:49:05 +0200 Subject: perf: Cleanup {start,commit,cancel}_txn details Clarify some of the transactional group scheduling API details and change it so that a successfull ->commit_txn also closes the transaction. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274803086.5882.1752.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5db5b7d65a18..af04c6fa59cb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -969,7 +969,7 @@ static int x86_pmu_enable(struct perf_event *event) * skip the schedulability test here, it will be peformed * at commit time(->commit_txn) as a whole */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) goto out; ret = x86_pmu.schedule_events(cpuc, n, assign); @@ -1096,7 +1096,7 @@ static void x86_pmu_disable(struct perf_event *event) * The events never got scheduled and ->cancel_txn will truncate * the event_list. */ - if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + if (cpuc->group_flag & PERF_EVENT_TXN) return; x86_pmu_stop(event); @@ -1388,7 +1388,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1401,7 +1401,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + cpuc->group_flag &= ~PERF_EVENT_TXN; /* * Truncate the collected events. */ @@ -1435,11 +1435,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); - /* - * Clear out the txn count so that ->cancel_txn() which gets - * run after ->commit_txn() doesn't undo things. - */ - cpuc->n_txn = 0; + cpuc->group_flag &= ~PERF_EVENT_TXN; return 0; } -- cgit v1.2.1 From 68aa00ac0a82e9a876c799bf6be7622b8f1c8517 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 3 Jun 2010 01:23:04 +0400 Subject: perf, x86: Make a second write to performance counter if needed On Netburst PMU we need a second write to a performance counter due to cpu erratum. A simple flag test instead of alternative instructions was choosen because wrmsrl is already a macro and if virtualization is turned on will need an additional wrapper call which is more expencise. nb: we should propably switch to jump-labels as only this facility reach the mainline. Signed-off-by: Cyrill Gorcunov Signed-off-by: Peter Zijlstra Cc: Robert Richter Cc: Lin Ming Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20100602212304.GC5264@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 12 +++++++++++- arch/x86/kernel/cpu/perf_event_p4.c | 9 +++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index af04c6fa59cb..79e199843db6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -220,6 +220,7 @@ struct x86_pmu { struct perf_event *event); struct event_constraint *event_constraints; void (*quirks)(void); + int perfctr_second_write; int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); @@ -925,8 +926,17 @@ x86_perf_event_set_period(struct perf_event *event) */ atomic64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + } perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69644d1..9286e736a70a 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -829,6 +829,15 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << 39) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, }; static __init int p4_pmu_init(void) -- cgit v1.2.1 From e78505958cf123048fb48cb56b79cebb8edd15fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 14:43:08 +0200 Subject: perf: Convert perf_event to local_t Since now all modification to event->count (and ->prev_count and ->period_left) are local to a cpu, change then to local64_t so we avoid the LOCK'ed ops. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 79e199843db6..2d0d29069275 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -296,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -314,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -439,7 +439,7 @@ static int x86_setup_perfctr(struct perf_event *event) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -886,7 +886,7 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; @@ -898,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -924,7 +924,7 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); -- cgit v1.2.1 From 147ec4d2361e355ab32499f739cc24845ceb89da Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:39 +0200 Subject: x86: Make save_stack_address() !CONFIG_FRAME_POINTER friendly If CONFIG_FRAME_POINTER=n, print_context_stack() shouldn't neglect the non-reliable addresses on stack, this is all we have if dump_trace(bp) is called with the wrong or zero bp. For example, /proc/pid/stack doesn't work if CONFIG_FRAME_POINTER=n. This patch obviously has no effect if CONFIG_FRAME_POINTER=y, otherwise it reverts 1650743c "x86: don't save unreliable stack trace entries". Also, remove the unnecessary type-cast. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar Cc: Andrew Morton LKML-Reference: <20100603193239.GA31530@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index ea54d029fe27..abc321d55870 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -26,8 +26,10 @@ static int save_stack_stack(void *data, char *name) static void save_stack_address(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (trace->skip > 0) { trace->skip--; return; @@ -39,9 +41,11 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; + struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif if (in_sched_functions(addr)) return; if (trace->skip > 0) { -- cgit v1.2.1 From 018378c55b03f88ff513aba4e0e93b8d4a9cf241 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jun 2010 21:32:43 +0200 Subject: x86: Unify save_stack_address() and save_stack_address_nosched() Cleanup. Factor the common code in save_stack_address() and save_stack_address_nosched(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Arjan van de Ven Cc: Vegard Nossum Cc: Ingo Molnar LKML-Reference: <20100603193243.GA31534@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/stacktrace.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index abc321d55870..b53c525368a7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -23,13 +23,16 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static void +__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; #ifdef CONFIG_FRAME_POINTER if (!reliable) return; #endif + if (nosched && in_sched_functions(addr)) + return; if (trace->skip > 0) { trace->skip--; return; @@ -38,22 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, false); +} + static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return; -#endif - if (in_sched_functions(addr)) - return; - if (trace->skip > 0) { - trace->skip--; - return; - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = addr; + return __save_stack_address(data, addr, reliable, true); } static const struct stacktrace_ops save_stack_ops = { -- cgit v1.2.1 From d6d4d4205cf4ce4ba13bc320305afbda25303496 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 3 Jun 2010 12:07:46 +0200 Subject: x86, xsave: Cleanup return codes in check_for_xstate() The places which call check_for_xstate() only care about zero or non-zero so this patch doesn't change how the code runs, but it's a cleanup. The main reason for this patch is that I'm looking for places which don't return -EFAULT for copy_from_user() failures. Signed-off-by: Dan Carpenter LKML-Reference: <20100603100746.GU5483@bicker> Signed-off-by: H. Peter Anvin Cc: Suresh Siddha --- arch/x86/kernel/xsave.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..980149867a19 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -36,15 +36,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], sizeof(struct _fpx_sw_bytes)); - if (err) - return err; + return -EFAULT; /* * First Magic check failed. */ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -1; + return -EINVAL; /* * Check for error scenarios. @@ -52,19 +51,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, if (fx_sw_user->xstate_size < min_xstate_size || fx_sw_user->xstate_size > xstate_size || fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -1; + return -EINVAL; err = __get_user(magic2, (__u32 *) (((void *)fpstate) + fx_sw_user->extended_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (err || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (magic2 != FP_XSTATE_MAGIC2) + return -EFAULT; return 0; } -- cgit v1.2.1 From 8cc1176e5de534d55cb26ff0cef3fd0d6ad8c3c0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 18:18:40 +0200 Subject: x86, cacheinfo: Carve out L3 cache slot accessors This is in preparation for disabling L3 cache indices after having received correctable ECCs in the L3 cache. Now we allow for initial setting of a disabled index slot (write once) and deny writing new indices to it after it has been disabled. Also, we deny using both slots to disable one and the same index. Userspace can restore the previously disabled indices by rewriting those sysfs entries when booting. Cleanup and reorganize code while at it. Signed-off-by: Borislav Petkov LKML-Reference: <20100602161840.GI18327@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 108 ++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 26 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 33eae2062cf5..898c2f4eab88 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) return l3; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, + int index) { int node; @@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3 = l3_caches[node]; } +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int slot) { - struct pci_dev *dev = this_leaf->l3->dev; - unsigned int reg = 0; + int index; if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (!dev) - return -EINVAL; + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); - pci_read_config_dword(dev, 0x1BC + slot * 4, ®); - return sprintf(buf, "0x%08x\n", reg); + return sprintf(buf, "FREE\n"); } #define SHOW_CACHE_DISABLE(slot) \ @@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, } } - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) { - struct pci_dev *dev = this_leaf->l3->dev; - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - unsigned long val = 0; + int ret = 0; #define SUBCACHE_MASK (3UL << 20) #define SUBCACHE_INDEX 0xfff - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + /* + * check whether this slot is already used or + * the index is already disabled + */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) return -EINVAL; + /* + * check whether the other slot has disabled the + * same index already + */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((index & SUBCACHE_INDEX) > l3->indices)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!dev) + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3->indices)) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - amd_l3_disable_index(this_leaf->l3, cpu, slot, val); - + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } return count; } @@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, #else /* CONFIG_CPU_SUP_AMD */ static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; #endif /* CONFIG_CPU_SUP_AMD */ @@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(index, this_leaf); + amd_check_l3_disable(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } -- cgit v1.2.1 From 12d8a961289644d265d8b3e88201878837c3b814 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 2 Jun 2010 20:29:21 +0200 Subject: x86, AMD: Extend support to future families Extend support to future families, and in particular: * extend direct mapping split of Tseg SMM area. * extend K8 flavored alternatives (NOPS). * rep movs* prefix is fast in ucode. Signed-off-by: Borislav Petkov LKML-Reference: <20100602182921.GA21557@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d2..12b9cff047c1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* -- cgit v1.2.1 From 1f9a0bd4989fd16842ad71fc89240b48ab191446 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:08 +0800 Subject: x86, mce: Rename MSR_IA32_MCx_CTL2 value Rename CMCI_EN to MCI_CTL2_CMCI_EN and CMCI_THRESHOLD_MASK to MCI_CTL2_CMCI_THRESHOLD_MASK to make naming consistent. Signed-off-by: Huang Ying LKML-Reference: <1275977348.3444.659.camel@yhuang-dev.sh.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920a..faf7b2919a87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,19 +95,19 @@ static void cmci_discover(int banks, int boot) rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); @@ -155,7 +155,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } -- cgit v1.2.1 From 3c417588603e5411f29d22a40f3b5ff71529a4f0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:09:10 +0800 Subject: x86, mce: Fix MSR_IA32_MCI_CTL2 CMCI threshold setup It is reported that CMCI is not raised when number of corrected error reaches preset threshold. After inspection, it is found that MSR_IA32_MCI_CTL2 threshold field is not setup properly. This patch fixed it. Value of MCI_CTL2_CMCI_THRESHOLD_MASK is fixed according to x86_64 Software Developer's Manual too. Reported-by: Shaohui Zheng Signed-off-by: Huang Ying LKML-Reference: <1275977350.3444.660.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index faf7b2919a87..6fcd0936194f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -102,6 +102,7 @@ static void cmci_discover(int banks, int boot) continue; } + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); -- cgit v1.2.1 From a2d7b0d4852536273b65d16fe179c65184fe5e2d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 8 Jun 2010 14:35:39 +0800 Subject: x86, mce: Use HW_ERR in MCE handler Use HW_ERR printk prefix in MCE handler. To make it more explicit that this is hardware error instead of software error. Signed-off-by: Huang Ying LKML-Reference: <1275978939.3444.668.camel@yhuang-dev.sh.intel.com> Reviewed-by: Hidetoshi Seto Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..094b228c8b06 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); static int default_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); + pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); return NOTIFY_STOP; } @@ -211,11 +211,11 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", m->cs, m->ip); @@ -224,14 +224,14 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); /* @@ -241,16 +241,6 @@ static void print_mce(struct mce *m) atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); -} - #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; @@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; @@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp) apei_err = apei_write_mce(final); } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -1220,7 +1208,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } -- cgit v1.2.1 From ec8c27e04f89a7575ca2c4facb99152e03d6a99c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Apr 2010 06:45:36 -0700 Subject: mce: convert to rcu_dereference_index_check() The mce processing applies rcu_dereference_check() to integers used as array indices. This patch therefore moves mce to the new RCU API rcu_dereference_index_check() that avoids the sparse processing that would otherwise result in compiler errors. Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..0e78657e29c5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,7 +51,7 @@ static DEFINE_MUTEX(mce_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_check((p), \ + rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_read_mutex)) -- cgit v1.2.1 From 421f91d21ad6f799dc7b489bb33cc560ccc56f98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 11 Jun 2010 12:17:00 +0200 Subject: fix typos concerning "initiali[zs]e" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/head32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e01618..192cd7ee35cc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -459,7 +459,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) } /* - * Setup the local APIC timer for this CPU. Copy the initilized values + * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ static void __cpuinit setup_APIC_timer(void) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e246037392..784360c0625c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -20,7 +20,7 @@ static void __init i386_default_early_setup(void) { - /* Initilize 32bit specific setup functions */ + /* Initialize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; -- cgit v1.2.1 From 23016bf0d25d62c45d8b8f61d55b290d704f7a79 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 3 Jun 2010 23:22:28 -0400 Subject: x86: Look for IA32_ENERGY_PERF_BIAS support The new IA32_ENERGY_PERF_BIAS MSR allows system software to give hardware a hint whether OS policy favors more power saving, or more performance. This allows the OS to have some influence on internal hardware power/performance tradeoffs where the OS has previously had no influence. The support for this feature is indicated by CPUID.06H.ECX.bit3, as documented in the Intel Architectures Software Developer's Manual. This patch discovers support of this feature and displays it as "epb" in /proc/cpuinfo. Signed-off-by: Venkatesh Pallipadi LKML-Reference: Signed-off-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 10fa5684a662..7369b4c2c55a 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -33,6 +33,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, -- cgit v1.2.1 From c882e0feb937af4e5b991cbd1c81536f37053e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Sch=C3=B6ne?= Date: Mon, 14 Jun 2010 13:37:20 +0200 Subject: x86, perf: Add power_end event to process_*.c cpu_idle routine Systems using the idle thread from process_32.c and process_64.c do not generate power_end events which could be traced using perf. This patch adds the event generation for such systems. Signed-off-by: Robert Schoene Acked-by: Arjan van de Ven Cc: Peter Zijlstra LKML-Reference: <1276515440.5441.45.camel@localhost> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++++ arch/x86/kernel/process_64.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af47..96586c3cbbbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,6 +57,8 @@ #include #include +#include + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -111,6 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a99f1f..3d9ea531ddd1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,8 @@ #include #include +#include + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -138,6 +140,9 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); + /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ -- cgit v1.2.1 From fd699c76552bbfa66631f019be415a87dbb08237 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Fri, 18 Jun 2010 17:46:53 -0400 Subject: x86, olpc: Add support for calling into OpenFirmware Add support for saving OFW's cif, and later calling into it to run OFW commands. OFW remains resident in memory, living within virtual range 0xff800000 - 0xffc00000. A single page directory entry points to the pgdir that OFW actually uses, so rather than saving the entire page table, we grab and install that one entry permanently in the kernel's page table. This is currently only used by the OLPC XO. Note that this particular calling convention breaks PAE and PAT, and so cannot be used on newer x86 hardware. Signed-off-by: Andres Salomon LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head_32.S | 6 +++ arch/x86/kernel/olpc.c | 12 +++--- arch/x86/kernel/olpc_ofw.c | 104 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 6 +++ 5 files changed, 122 insertions(+), 7 deletions(-) create mode 100644 arch/x86/kernel/olpc_ofw.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b22083721..0925676266bd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..ff4c453e13f3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -131,6 +131,12 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_OLPC_OPENFIRMWARE + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..156605281f56 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -21,10 +21,7 @@ #include #include #include - -#ifdef CONFIG_OPEN_FIRMWARE -#include -#endif +#include struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); @@ -188,14 +185,15 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OPEN_FIRMWARE +#ifdef CONFIG_OLPC_OPENFIRMWARE static void __init platform_detect(void) { size_t propsize; __be32 rev; + void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; - if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, - &propsize) || propsize != 4) { + if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); rev = cpu_to_be32(0); } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 000000000000..469ee4384295 --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; + + if ((unsigned long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd031..b008e7883207 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -102,6 +102,7 @@ #include #include +#include #include #include @@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + /* OFW also may relocate the fixmap */ + olpc_ofw_detect(); + early_trap_init(); early_cpu_init(); early_ioremap_init(); + setup_olpc_ofw_pgd(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; -- cgit v1.2.1 From 75a9cac430a1bd2a5219c74508ca01b0ddfddc9a Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 23 Jun 2010 20:27:00 -0400 Subject: x86, olpc: Add comment about implicit optimization barrier Signed-off-by: Andres Salomon Cc: H. Peter Anvin LKML-Reference: <20100618174653.7755a39a@dev.queued.net> Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc_ofw.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index 469ee4384295..f5d499fbe74f 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -35,6 +35,8 @@ void __init setup_olpc_ofw_pgd(void) /* install OFW's PDE permanently into the kernel's pgtable */ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -- cgit v1.2.1 From 0c4519e825c9e2b6a8310deff8582f8c35bfbba9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 21:21:27 +0200 Subject: x86: Set resume bit before returning from breakpoint exception Instruction breakpoints trigger before the instruction executes, and returning back from the breakpoint handler brings us again to the instruction that breakpointed. This naturally bring to a breakpoint recursion. To solve this, x86 has the Resume Bit trick. When the cpu flags have the RF flag set, the next instruction won't trigger any instruction breakpoint, and once this instruction is executed, RF is cleared back. This let's us jump back to the instruction that triggered the breakpoint without recursion. Use this when an instruction breakpoint triggers. Signed-off-by: Frederic Weisbecker Cc: Will Deacon Cc: Prasad Cc: Mahesh Salgaonkar Cc: Paul Mackerras Cc: Ingo Molnar Cc: Jason Wessel --- arch/x86/kernel/hw_breakpoint.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b803d2fd..eaa6ae2a010b 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -466,6 +466,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin. + */ + if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + args->regs->flags |= X86_EFLAGS_RF; + rcu_read_unlock(); } /* -- cgit v1.2.1 From f7809daf64bf119fef70af172db6a0636fa51f92 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 24 Jun 2010 10:00:24 +0200 Subject: x86: Support for instruction breakpoints Instruction breakpoints need to have a specific length of 0 to be working. Bring this support but also take care the user is not trying to set an unsupported length, like a range breakpoint for example. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Prasad Cc: Mahesh Salgaonkar Cc: Will Deacon Cc: Jason Wessel --- arch/x86/kernel/hw_breakpoint.c | 44 +++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index eaa6ae2a010b..a474ec37c32f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -208,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, { /* Len */ switch (x86_len) { + case X86_BREAKPOINT_LEN_X: + *gen_len = sizeof(long); + break; case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -251,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) info->address = bp->attr.bp_addr; + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. + */ + if (bp->attr.bp_len == sizeof(long)) { + info->len = X86_BREAKPOINT_LEN_X; + return 0; + } + default: + return -EINVAL; + } + /* Len */ switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: @@ -271,21 +297,6 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; } - /* Type */ - switch (bp->attr.bp_type) { - case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; - break; - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; - break; - case HW_BREAKPOINT_X: - info->type = X86_BREAKPOINT_EXECUTE; - break; - default: - return -EINVAL; - } - return 0; } /* @@ -305,6 +316,9 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { + case X86_BREAKPOINT_LEN_X: + align = sizeof(long) -1; + break; case X86_BREAKPOINT_LEN_1: align = 0; break; -- cgit v1.2.1 From b71ab8c2025caef8db719aa41af0ed735dc543cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 29 Jun 2010 10:07:14 +0200 Subject: workqueue: increase max_active of keventd and kill current_is_keventd() Define WQ_MAX_ACTIVE and create keventd with max_active set to half of it which means that keventd now can process upto WQ_MAX_ACTIVE / 2 - 1 works concurrently. Unless some combination can result in dependency loop longer than max_active, deadlock won't happen and thus it's unnecessary to check whether current_is_keventd() before trying to schedule a work. Kill current_is_keventd(). (Lockdep annotations are broken. We need lock_map_acquire_read_norecurse()) Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Tony Luck Cc: Andi Kleen Cc: Oleg Nesterov --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..4d90f376e985 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,7 +735,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) + if (!keventd_up()) c_idle.work.func(&c_idle.work); else { schedule_work(&c_idle.work); -- cgit v1.2.1 From 567a9fd86735ccdc897768ed2dacdd5e83a13509 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Jun 2010 14:53:50 +0900 Subject: kprobes/x86: Fix kprobes to skip prefixes correctly Fix resume_execution() and is_IF_modifier() to skip x86 instruction prefixes correctly by using x86 instruction attribute. Without this fix, resume_execution() can't handle instructions which have non-REX prefixes (REX prefixes are skipped). This will cause unexpected kernel panic by hitting bad address when a kprobe hits on two-byte ret (e.g. "repz ret" generated for Athlon/K8 optimization), because it just checks "repz" and can't recognize the "ret" instruction. These prefixes can be found easily with x86 instruction attribute. This patch introduces skip_prefixes() and uses it in resume_execution() and is_IF_modifier() to skip prefixes. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli LKML-Reference: <4C298A6E.8070609@hitachi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 345a4b1fe144..175f85ceace3 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) } /* - * Check for the REX prefix which can only exist on X86_64 - * X86_32 always returns 0 + * Skip the prefixes of the instruction. */ -static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } #ifdef CONFIG_X86_64 - if ((*insn & 0xf0) == 0x40) - return 1; + if (inat_is_rex_prefix(attr)) + insn++; #endif - return 0; + return insn; } /* @@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) */ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) { + /* Skip prefixes */ + insn = skip_prefixes(insn); + switch (*insn) { case 0xfa: /* cli */ case 0xfb: /* sti */ @@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) return 1; } - /* - * on X86_64, 0x40-0x4f are REX prefixes so we need to look - * at the next byte instead.. but of course not recurse infinitely - */ - if (is_REX_prefix(insn)) - return is_IF_modifier(++insn); - return 0; } @@ -803,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, unsigned long orig_ip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; - /*skip the REX prefix*/ - if (is_REX_prefix(insn)) - insn++; + /* Skip prefixes */ + insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; switch (*insn) { -- cgit v1.2.1 From 39ef13a4ac28aa64cfe1bc36e6e00f1096707a28 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 5 Jul 2010 10:09:29 +0800 Subject: perf, x86: P4 PMU -- redesign cache events To support cache events we have reserved the low 6 bits in hw_perf_event::config (which is a part of CCCR register configuration actually). These bits represent Replay Event mertic enumerated in enum P4_PEBS_METRIC. The caller should not care about which exact bits should be set and how -- the caller just chooses one P4_PEBS_METRIC entity and puts it into the config. The kernel will track it and set appropriate additional MSR registers (metrics) when needed. The reason for this redesign was the PEBS enable bit, which should not be set until DS (and PEBS sampling) support will be implemented properly. TODO ==== - PEBS sampling (note it's tricky and works with _one_ counter only so for HT machines it will be not that easy to handle both threads) - tracking of PEBS registers state, a user might need to turn PEBS off completely (ie no PEBS enable, no UOP_tag) but some other event may need it, such events clashes and should not run simultaneously, at moment we just don't support such events - eventually export user space bits in separate header which will allow user apps to configure raw events more conveniently. Signed-off-by: Cyrill Gorcunov Signed-off-by: Lin Ming Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <1278295769.9540.15.camel@minggr.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 147 +++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 9286e736a70a..107711bf0ee8 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -21,22 +21,36 @@ struct p4_event_bind { char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; -struct p4_cache_event_bind { +struct p4_pebs_bind { unsigned int metric_pebs; unsigned int metric_vert; }; -#define P4_GEN_CACHE_EVENT_BIND(name) \ - [P4_CACHE__##name] = { \ - .metric_pebs = P4_PEBS__##name, \ - .metric_vert = P4_VERT__##name, \ +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ } -static struct p4_cache_event_bind p4_cache_event_bind_map[] = { - P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), - P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), }; /* @@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = { }, }; -#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ p4_config_pack_escr(P4_ESCR_EVENT(event) | \ P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(cache_event | \ + p4_config_pack_cccr(metric | \ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) static __initconst const u64 p4_hw_cache_event_ids @@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__1stl_cache_load_miss_retired), + P4_PEBS_METRIC__1stl_cache_load_miss_retired), }, }, [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__2ndl_cache_load_miss_retired), + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), }, }, [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_load_miss_retired), + P4_PEBS_METRIC__dtlb_load_miss_retired), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0, [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_CACHE__dtlb_store_miss_retired), + P4_PEBS_METRIC__dtlb_store_miss_retired), }, }, [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_CACHE__itlb_reference_hit), + P4_PEBS_METRIC__none), [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_CACHE__itlb_reference_miss), + P4_PEBS_METRIC__none), }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v; + + /* user data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) { + pr_warning("P4 PMU: Unknown event code: %d\n", v); + return -EINVAL; + } + + /* + * it may have some screwed PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { + pr_warning("P4 PMU: PEBS are not supported yet\n"); + return -EINVAL; + } + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { + pr_warning("P4 PMU: Unknown metric code: %d\n", v); + return -EINVAL; + } + + return 0; +} + static int p4_hw_config(struct perf_event *event) { int cpu = get_cpu(); int rc = 0; - unsigned int evnt; u32 escr, cccr; /* @@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { - /* user data may have out-of-bound event index */ - evnt = p4_config_unpack_event(event->attr.config); - if (evnt >= ARRAY_SIZE(p4_event_bind_map)) { - rc = -EINVAL; + rc = p4_validate_raw_event(event); + if (rc) goto out; - } /* * We don't control raw events so it's up to the caller @@ -451,12 +488,15 @@ static int p4_hw_config(struct perf_event *event) * on HT machine but allow HT-compatible specifics to be * passed on) * + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + * * XXX: HT wide things should check perf_paranoid_cpu() && * CAP_SYS_ADMIN */ event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT)); + p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); } rc = x86_setup_perfctr(event); @@ -482,6 +522,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) return overflow; } +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * noone is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsence from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + */ +} + static inline void p4_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -507,6 +570,26 @@ static void p4_pmu_disable_all(void) continue; p4_pmu_disable_event(event); } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -515,9 +598,7 @@ static void p4_pmu_enable_event(struct perf_event *event) int thread = p4_ht_config_thread(hwc->config); u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); unsigned int idx = p4_config_unpack_event(hwc->config); - unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config); struct p4_event_bind *bind; - struct p4_cache_event_bind *bind_cache; u64 escr_addr, cccr; bind = &p4_event_bind_map[idx]; @@ -537,16 +618,10 @@ static void p4_pmu_enable_event(struct perf_event *event) cccr = p4_config_unpack_cccr(hwc->config); /* - * it could be Cache event so that we need to - * set metrics into additional MSRs + * it could be Cache event so we need to write metrics + * into additional MSRs */ - BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); - if (idx_cache > P4_CACHE__NONE && - idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) { - bind_cache = &p4_cache_event_bind_map[idx_cache]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert); - } + p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); (void)checking_wrmsrl(hwc->config_base + hwc->idx, -- cgit v1.2.1 From 8e221b6db4477643fefc885a97ea9889ac733140 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 22 Jun 2010 16:23:37 -0700 Subject: x86: Avoid unnecessary __clear_user() and xrstor in signal handling fxsave/xsave doesn't touch all the bytes in the memory layout used by these instructions. Specifically SW reserved (bytes 464..511) fields in the fxsave frame and the reserved fields in the xsave header. To present a clean context for the signal handling, just clear these fields instead of clearing the complete fxsave/xsave memory layout, when we dump these registers directly to the user signal frame. Also avoid the call to second xrstor (which inits the state not passed in the signal frame) in restore_user_xstate() if all the state has already been restored by the first xrstor. These changes improve the performance of signal handling(by ~3-5% as measured by the lat_sig). Signed-off-by: Suresh Siddha LKML-Reference: <1277249017.2847.85.camel@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..6e73db1b7b4e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -91,14 +91,6 @@ int save_i387_xstate(void __user *buf) return 0; if (task_thread_info(tsk)->status & TS_USEDFPU) { - /* - * Start with clearing the user buffer. This will present a - * clean context for the bytes not touched by the fxsave/xsave. - */ - err = __clear_user(buf, sig_xstate_size); - if (err) - return err; - if (use_xsave()) err = xsave_user(buf); else @@ -184,8 +176,8 @@ static int restore_user_xstate(void __user *buf) * init the state skipped by the user. */ mask = pcntxt_mask & ~mask; - - xrstor_state(init_xstate_buf, mask); + if (unlikely(mask)) + xrstor_state(init_xstate_buf, mask); return 0; -- cgit v1.2.1 From 83a7a2ad2a9173dcabc05df0f01d1d85b7ba1c2c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 10 Jun 2010 00:10:43 +0000 Subject: x86, alternatives: Use 16-bit numbers for cpufeature index We already have cpufeature indicies above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however. [ v3: updated to include open-coded locations ] Signed-off-by: H. Peter Anvin LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..7862cf510ea9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -914,7 +914,7 @@ ENTRY(simd_coprocessor_error) .balign 4 .long 661b .long 663f - .byte X86_FEATURE_XMM + .word X86_FEATURE_XMM .byte 662b-661b .byte 664f-663f .previous -- cgit v1.2.1 From bdc802dcca1709b01988d57e91f9f35ce1609fcc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 7 Jul 2010 17:29:18 -0700 Subject: x86, cpu: Support the features flags in new CPUID leaf 7 Intel has defined CPUID leaf 7 as the next set of feature flags (see the AVX specification, version 007). Add support for this new feature flags word. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f2211e..c7358303d8cd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -551,6 +551,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + if (eax > 0) + c->x86_capability[9] = ebx; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; -- cgit v1.2.1 From 5bbd4a336c81d32df71642abf310cf3d0c98dc9b Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Wed, 7 Jul 2010 19:51:59 -0400 Subject: x86/apic/es7000_32: Remove unused variable In today's linux-next I got this compile warning: arch/x86/kernel/apic/es7000_32.c:132: warning: 'base' defined but not used Current patch solves the issue removing the unused variable. Signed-off-by: Javier Martinez Canillas Cc: Rakib Mullick Cc: Eric W. Biederman Cc: Cyrill Gorcunov Cc: Tejun Heo LKML-Reference: <1278546719.9020.4.camel@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 425e53a87feb..8593582d8022 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -129,7 +129,6 @@ int es7000_plat; * GSI override for ES7000 platforms. */ -static unsigned int base; static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { -- cgit v1.2.1 From 9279aa55061a280b826bdf9ba5ab5f6a566c1dfb Mon Sep 17 00:00:00 2001 From: Ky Srinivasan Date: Mon, 28 Jun 2010 08:48:55 -0600 Subject: x86: Export the symbol ms_hyperv This is needed so that the staging hyperv can properly access this symbol. Signed-off-by: K. Y. Srinivasan Acked-by: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mshyperv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 16f41bbe46b6..d944bf6c50e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -18,6 +18,7 @@ #include struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); static bool __init ms_hyperv_platform(void) { -- cgit v1.2.1 From 3b770a2128423a687e6e9c57184a584fb4ba4c77 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 13 Jul 2010 14:57:50 -0700 Subject: x86, alternatives: BUG on encountering an invalid CPU feature number Make the alternatives-patching code BUG on encountering an invalid CPU feature number. Should have done this a long time ago. Signed-off-by: H. Peter Anvin Cc: Yinghai Lu Cc: Suresh Siddha LKML-Reference: --- arch/x86/kernel/alternative.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 70237732a6c7..f65ab8b014c4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; #ifdef CONFIG_X86_64 -- cgit v1.2.1 From b2691085d1f3ccce641dcfdd02722ba5d34db6ba Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 28 Jun 2010 16:46:48 -0700 Subject: x86: Clean up arch/x86/kernel/cpu/mtrr/cleanup.c: use ";" not "," to terminate statements Also needed if pr_ becomes a bit more space efficient. Signed-off-by: Joe Perches Acked-by: Thomas Gleixner Cc: "H. Peter Anvin" LKML-Reference: <1277768808.29157.280.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..c5f59d071425 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) unsigned long gran_base, chunk_base, lose_base; char gran_factor, chunk_factor, lose_factor; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", result[i].bad ? "*BAD*" : " ", -- cgit v1.2.1 From f33ebbe9da2c3c24664a0ad4f8fd83f293547e63 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 6 May 2010 20:17:00 +0200 Subject: unistd: add __NR_prlimit64 syscall numbers Add __NR_prlimit64 syscall numbers to asm-generic. Add them also to asm-x86, both 32 and 64-bit. Signed-off-by: Jiri Slaby Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..eca1d7d23ab5 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_prlimit64 -- cgit v1.2.1 From 93a7ca0c3ebe5d931126f1fb732cb9c4518383d4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 16 Jul 2010 10:11:21 -0500 Subject: x86, UV: Initialize BAU MMRs only on hubs with cpus Remove the initialization of MMRs UVH_LB_BAU_SB_ACTIVATION_CONTROL and UVH_BAU_DATA_BROADCAST on UV hubs that have no active cpus. Such initialization on hubs with no active cpus would result in a kernel page fault. This is not of real high priority, because we don't have any such systems (with UV hubs that have no active cpus). But they will be coming. Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index abf3c31f14cf..59efb5390b37 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1635,12 +1635,16 @@ static int __init uv_bau_init(void) alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); + if (uv_blade_nr_possible_cpus(uvhub)) { + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + uv_write_global_mmr64(pnode, + UVH_LB_BAU_SB_ACTIVATION_CONTROL, + ((unsigned long)1 << 63)); + mmr = 1; /* should be 1 to broadcast to both sockets */ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, + mmr); + } } return 0; -- cgit v1.2.1 From a2531293dbb7608fa672ff28efe3ab4027917a2f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 18 Jul 2010 14:27:13 +0200 Subject: update email address pavel@suse.cz no longer works, replace it with working address. Signed-off-by: Pavel Machek Signed-off-by: Jiri Kosina --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 82e508677b91..f51cc55aced6 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -2,7 +2,7 @@ * sleep.c - x86-specific ACPI sleep support. * * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek + * Copyright (C) 2001-2003 Pavel Machek */ #include diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c4f9182ca3ac..4c9c67bf09b7 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -140,7 +140,7 @@ * is now the way life works). * Fix thinko in suspend() (wrong return). * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek + * Make kapmd absorb more idle time (Pavel Machek * modified by sfr). * Disable interrupts while we are suspended (Andy Henroid * fixed by sfr). diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e6..0af9aa20fce1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -9,7 +9,7 @@ * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski - * (C) 2004 Pavel Machek + * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. * -- cgit v1.2.1 From 6c54aabd5e687092557f4881ce2d4013b971f293 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 12:03:51 -0400 Subject: x86/amd-iommu: Use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286d78c6..29dd3b9f2f09 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2609,8 +2609,7 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - + for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; -- cgit v1.2.1 From edb18f8ab02843453306601c4aa697f9691129cd Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:50 -0700 Subject: x86, cpu: Make init_scattered_cpuid_features() consider cpuid subleaves Some cpuid features (like xsaveopt) are enumerated using cpuid subleaves. Extend init_scattered_cpuid_features() to take subleaf into account. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.439900717@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 7369b4c2c55a..03cf24a3d933 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -14,6 +14,7 @@ struct cpuid_bit { u8 reg; u8 bit; u32 level; + u32 sub_leaf; }; enum cpuid_regs { @@ -30,16 +31,16 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, - { 0, 0, 0, 0 } + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } }; for (cb = cpuid_bits; cb->feature; cb++) { @@ -50,8 +51,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], - ®s[CR_ECX], ®s[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); -- cgit v1.2.1 From 5734f62b6601d88fd8ec720cb56b93fd3a030557 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, cpu: Enumerate xsaveopt Enumerate the xsaveopt feature. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/addon_cpuid_features.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 03cf24a3d933..41eebcd90fce 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -35,6 +35,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, -- cgit v1.2.1 From a1488f8bf4d72ad724700f6e982469a1240e4264 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:48 -0700 Subject: x86, xsave: Track the offset, size of state in the xsave layout Subleaves of the cpuid vector 0xd provides the offset and size of different feature state that are managed by the xsave/xrstor. Track this for the upcoming usage during signal handling. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.262987929@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 980149867a19..4993caa4181c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -21,6 +21,8 @@ struct _fpx_sw_bytes fx_sw_reserved; struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -301,6 +303,31 @@ void __cpuinit xsave_init(void) xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + /* * setup the xstate image representing the init state */ @@ -308,6 +335,8 @@ static void __init setup_xstate_init(void) { init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + setup_xstate_features(); } /* -- cgit v1.2.1 From 29104e101d710dd152f807978884643a52eca8b7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:49 -0700 Subject: x86, xsave: Sync xsave memory layout with its header for user handling With xsaveopt, if a processor implementation discern that a processor state component is in its initialized state it may modify the corresponding bit in the xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory layout. Hence wHile presenting the xstate information to the user, we always ensure that the memory layout of a feature will be in the init state if the corresponding header bit is zero. This ensures the consistency and avoids the condition of the user seeing some some stale state in the memory layout during signal handling, debugging etc. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.351459480@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 11 ++++++ arch/x86/kernel/xsave.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 99 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b32253..6106af9fd129 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -190,6 +190,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); } @@ -207,6 +209,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.state->fxsave, 0, -1); @@ -446,6 +450,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -467,6 +473,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); @@ -533,6 +541,9 @@ static int save_i387_xsave(void __user *buf) struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4993caa4181c..368047c8d507 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -23,6 +23,76 @@ struct _fpx_sw_bytes fx_sw_reserved_ia32; static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + + if (!fx) + return; + + BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -112,6 +182,7 @@ int save_i387_xstate(void __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { + sanitize_i387_state(tsk); if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, xstate_size)) return -1; @@ -333,10 +404,26 @@ static void setup_xstate_features(void) */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; - setup_xstate_features(); + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* -- cgit v1.2.1 From 6bad06b768920e278c7cedfdda56a0b4c6a35ee9 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 19 Jul 2010 16:05:52 -0700 Subject: x86, xsave: Use xsaveopt in context-switch path when supported xsaveopt is a more optimized form of xsave specifically designed for the context switch usage. xsaveopt doesn't save the state that's not modified from the prior xrstor. And if a specific feature state gets modified to the init state, then xsaveopt just updates the header bit in the xsave memory layout without updating the corresponding memory layout. Signed-off-by: Suresh Siddha LKML-Reference: <20100719230205.604014179@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c7358303d8cd..3f715efc594d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -- cgit v1.2.1 From 2decb194e65ab66eaf787512dc572cdc99893b24 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 19 Jul 2010 18:32:04 -0700 Subject: x86, cpu: Split addon_cpuid_features.c addon_cpuid_features.c contains exactly two almost completely unrelated functions, plus has a long and very generic name. Split it into two files, scattered.c for the scattered feature flags, and topology.c for the topology information. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/addon_cpuid_features.c | 150 ----------------------------- arch/x86/kernel/cpu/scattered.c | 61 ++++++++++++ arch/x86/kernel/cpu/topology.c | 99 +++++++++++++++++++ 4 files changed, 161 insertions(+), 151 deletions(-) delete mode 100644 arch/x86/kernel/cpu/addon_cpuid_features.c create mode 100644 arch/x86/kernel/cpu/scattered.c create mode 100644 arch/x86/kernel/cpu/topology.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6f..5e3a3512ba05 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,7 +12,7 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c deleted file mode 100644 index 41eebcd90fce..000000000000 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Routines to indentify additional cpu features that are scattered in - * cpuid space. - */ -#include - -#include -#include - -#include - -struct cpuid_bit { - u16 feature; - u8 reg; - u8 bit; - u32 level; - u32 sub_leaf; -}; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) -{ - u32 max_level; - u32 regs[4]; - const struct cpuid_bit *cb; - - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, - { 0, 0, 0, 0, 0 } - }; - - for (cb = cpuid_bits; cb->feature; cb++) { - - /* Verify that the level is valid */ - max_level = cpuid_eax(cb->level & 0xffff0000); - if (max_level < cb->level || - max_level > (cb->level | 0xffff)) - continue; - - cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], - ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); - - if (regs[cb->reg] & (1 << cb->bit)) - set_cpu_cap(c, cb->feature); - } -} - -/* leaf 0xb SMT level */ -#define SMT_LEVEL 0 - -/* leaf 0xb sub-leaf types */ -#define INVALID_TYPE 0 -#define SMT_TYPE 1 -#define CORE_TYPE 2 - -#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) -#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) -#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) - -/* - * Check for extended topology enumeration cpuid leaf 0xb and if it - * exists, use it for populating initial_apicid and cpu topology - * detection. - */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; - unsigned int core_select_mask, core_level_siblings; - static bool printed; - - if (c->cpuid_level < 0xb) - return; - - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); - - /* - * check if the cpuid leaf 0xb is actually implemented. - */ - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) - return; - - set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); - - /* - * initial apic id, which also represents 32-bit extended x2apic id. - */ - c->initial_apicid = edx; - - /* - * Populate HT related information from sub-leaf level 0. - */ - core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - - sub_index = 1; - do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); - - /* - * Check for the Core type in the implemented sub leaves. - */ - if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { - core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - break; - } - - sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); - - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; - - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) - & core_select_mask; - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); - /* - * Reinit the apicid, now that we have extended initial_apicid. - */ - c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); - - c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - printed = 1; - } - return; -#endif -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000000..9815364b477e --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,61 @@ +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. + */ +#include + +#include +#include + +#include + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 000000000000..4397e987a1cf --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,99 @@ +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ + +#include +#include +#include +#include + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + static bool printed; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. + */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } + return; +#endif +} -- cgit v1.2.1 From fa10ba64ac94fec4611b79804023eb087862ffe0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 20 Jul 2010 15:19:49 -0700 Subject: x86, gcc-4.6: Fix set but not read variables Just some dead code, no real bugs. Found by gcc 4.6 -Wall Signed-off-by: Andi Kleen LKML-Reference: <201007202219.o6KMJnQ0021072@imap1.linux-foundation.org> Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0bcf235..a2e0caf26e17 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - u32 agp_aper_base = 0, agp_aper_order = 0; + u32 agp_aper_order = 0; int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; @@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ - agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61c..7d28d7d03885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; - int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ - cpu = get_cpu(); + get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); -- cgit v1.2.1 From db10db48b2c530def21bfd76d576702c7df7f620 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:49 +0200 Subject: x86, xsave: 32/64 bit boot cpu check unification in initialization Boot cpu id is always 0, thus simplifying and unifying boot cpu check. boot_cpu_id is there for historical reasons and was renamed to boot_cpu_physical_apicid in patch: c70dcb7 x86: change boot_cpu_id to boot_cpu_physical_apicid However, there are some remaining occurrences of boot_cpu_id that are never touched in the kernel and thus its value is always 0. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3f715efc594d..26804b2986b8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1273,7 +1273,7 @@ void __cpuinit cpu_init(void) /* * Boot processor to setup the FP and extended state context info. */ - if (smp_processor_id() == boot_cpu_id) + if (!smp_processor_id()) init_thread_xstate(); xsave_init(); -- cgit v1.2.1 From 82d4150cec83b9775f84810b39a1c0b91585d429 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 20 Jul 2010 20:50:51 +0200 Subject: x86, xsave: Move boot cpu initialization to xsave_init() This patch moves boot cpu initialization to xsave_init(). Now all cpus are initialized in one single function. Signed-off-by: Robert Richter LKML-Reference: <1279651857-24639-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 6 ------ arch/x86/kernel/i387.c | 5 ----- arch/x86/kernel/xsave.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 26804b2986b8..40561085d4f3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1270,12 +1270,6 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); - xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 6106af9fd129..2f32ef05f10e 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -93,11 +93,6 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); xsave_init(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 368047c8d507..ab9ad48b6530 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,7 +360,7 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -void __cpuinit xsave_init(void) +static void __cpuinit __xsave_init(void) { if (!cpu_has_xsave) return; @@ -446,7 +446,7 @@ void __ref xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xsave_init(); + __xsave_init(); /* * Recompute the context size for enabled features @@ -463,3 +463,13 @@ void __ref xsave_cntxt_init(void) "cntxt size 0x%x\n", pcntxt_mask, xstate_size); } + +void __cpuinit xsave_init(void) +{ + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + __xsave_init(); +} -- cgit v1.2.1 From 5edd19af18a36a4e22c570b1b969179e0ca1fe4c Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 20 Jul 2010 18:09:05 -0500 Subject: x86, UV: Make kdump avoid stack dumps UV NMI callback's should not write stack dumps when a kdump is to be written. When invoking the crash kernel to write a dump, kdump_nmi_shootdown_cpus() uses NMI's to get all the cpu's to save their register context and halt. But the NMI interrupt handler runs a callback list. This patch sets a flag to prevent any of those callbacks from interfering with the halt of the cpu. For UV, which currently has the only callback to which this is relevant, the uv_handle_nmi() callback should not do dumping of stacks. The 'in_crash_kexec' flag is defined as an extern in kdebug.h firstly because x2apic_uv_x.c includes it. Secondly because some future callback might need the flag to know that it should not enter the debugger. (Such a scenario was in fact present in the 2.6.32 kernel, SuSE distribution, where a call to kdb needed to be avoided.) Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 4 ++++ arch/x86/kernel/crash.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e46f98f36e31..7b598b84c902 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { if (reason != DIE_NMI_IPI) return NOTIFY_OK; + + if (in_crash_kexec) + /* do nothing if entering the crash kernel */ + return NOTIFY_OK; /* * Use a lock so only one cpu prints at a time * to prevent intermixed output. diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ebd4c51d096a..764c7c2b1811 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -28,6 +28,8 @@ #include #include +int in_crash_kexec; + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct die_args *args) @@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) static void kdump_nmi_shootdown_cpus(void) { + in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); -- cgit v1.2.1 From 0e49bf66d2ca649b167428adddbbbe9d9bd4894c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:52 +0200 Subject: x86, xsave: Separate fpu and xsave initialization As xsave also supports other than fpu features, it should be initialized independently of the fpu. This patch moves this out of fpu initialization. There is also a lot of cross referencing between fpu and xsave code. This patch reduces this by making xsave_cntxt_init() and init_thread_xstate() static functions. The patch moves the cpu_has_xsave check at the beginning of xsave_init(). All other checks may removed then. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-2-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/i387.c | 27 +++++++++++++++++++-------- arch/x86/kernel/xsave.c | 10 +++++----- 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 40561085d4f3..94c36c7ac183 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1210,6 +1210,7 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); + xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1270,6 +1271,7 @@ void __cpuinit cpu_init(void) clear_used_math(); mxcsr_feature_mask_init(); + fpu_init(); xsave_init(); } #endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 2f32ef05f10e..e73c54ebafce 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) stts(); } -void __cpuinit init_thread_xstate(void) +static void __cpuinit init_thread_xstate(void) { + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + if (!HAVE_HWFP) { xstate_size = sizeof(struct i387_soft_struct); return; } - if (cpu_has_xsave) { - xsave_cntxt_init(); - return; - } - if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ + void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); @@ -93,14 +94,24 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - xsave_init(); + if (!smp_processor_id()) + init_thread_xstate(); mxcsr_feature_mask_init(); /* clean state in init */ current_thread_info()->status = 0; clear_used_math(); } -#endif /* CONFIG_X86_64 */ + +#else /* CONFIG_X86_64 */ + +void __cpuinit fpu_init(void) +{ + if (!smp_processor_id()) + init_thread_xstate(); +} + +#endif /* CONFIG_X86_32 */ static void fpu_finit(struct fpu *fpu) { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ab9ad48b6530..550bf45236f4 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -362,9 +362,6 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); */ static void __cpuinit __xsave_init(void) { - if (!cpu_has_xsave) - return; - set_in_cr4(X86_CR4_OSXSAVE); /* @@ -429,7 +426,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -void __ref xsave_cntxt_init(void) +static void __cpuinit xsave_cntxt_init(void) { unsigned int eax, ebx, ecx, edx; @@ -466,10 +463,13 @@ void __ref xsave_cntxt_init(void) void __cpuinit xsave_init(void) { + if (!cpu_has_xsave) + return; + /* * Boot processor to setup the FP and extended state context info. */ if (!smp_processor_id()) - init_thread_xstate(); + xsave_cntxt_init(); __xsave_init(); } -- cgit v1.2.1 From 97e80a70db689fb1e876df9f12305cc72f85ca53 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:53 +0200 Subject: x86, xsave: Introduce xstate enable functions The patch renames xsave_cntxt_init() and __xsave_init() into xstate_enable_boot_cpu() and xstate_enable() as this names are more meaningful. It also removes the duplicate xcr setup for the boot cpu. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-3-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 550bf45236f4..2322f586c051 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,15 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static void __cpuinit __xsave_init(void) +static inline void xstate_enable(u64 mask) { set_in_cr4(X86_CR4_OSXSAVE); - - /* - * Enable all the features that the HW is capable of - * and the Linux kernel is aware of. - */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); } /* @@ -426,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -static void __cpuinit xsave_cntxt_init(void) +static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -443,7 +438,8 @@ static void __cpuinit xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - __xsave_init(); + + xstate_enable(pcntxt_mask); /* * Recompute the context size for enabled features @@ -470,6 +466,7 @@ void __cpuinit xsave_init(void) * Boot processor to setup the FP and extended state context info. */ if (!smp_processor_id()) - xsave_cntxt_init(); - __xsave_init(); + xstate_enable_boot_cpu(); + else + xstate_enable(pcntxt_mask); } -- cgit v1.2.1 From ee813d53a8e980a3a28318efb8935d45723f5211 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:54 +0200 Subject: x86, xsave: Check cpuid level for XSTATE_CPUID (0x0d) The patch introduces the XSTATE_CPUID macro and adds a check that tests if XSTATE_CPUID exists. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-4-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 2322f586c051..5adb7fb408f0 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -379,7 +379,7 @@ static void setup_xstate_features(void) xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); do { - cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); if (eax == 0) break; @@ -425,7 +425,12 @@ static void __cpuinit xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { @@ -444,7 +449,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) /* * Recompute the context size for enabled features */ - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; update_regset_xstate_info(xstate_size, pcntxt_mask); -- cgit v1.2.1 From 45c2d7f46211a0b1f6b425c59575c53145afc4b4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:55 +0200 Subject: x86, xsave: Make init_xstate_buf static The pointer is only used in xsave.c. Making it static. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-5-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 5adb7fb408f0..3b44a9b1eca4 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -16,6 +16,11 @@ */ u64 pcntxt_mask; +/* + * Represents init state for the supported extended state. + */ +static struct xsave_struct *init_xstate_buf; + struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION struct _fpx_sw_bytes fx_sw_reserved_ia32; @@ -348,11 +353,6 @@ static void prepare_fx_sw_frame(void) #endif } -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - #ifdef CONFIG_X86_64 unsigned int sig_xstate_size = sizeof(struct _fpstate); #endif -- cgit v1.2.1 From 4995b9dba908436c1611454f9bd2cb3ddf6babee Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 21 Jul 2010 19:03:56 +0200 Subject: x86, xsave: Add __init attribute to setup_xstate_features() This is called only from initialization code. Signed-off-by: Robert Richter LKML-Reference: <1279731838-1522-6-git-send-email-robert.richter@amd.com> Acked-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3b44a9b1eca4..cfc7901ee94e 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -370,7 +370,7 @@ static inline void xstate_enable(u64 mask) * Record the offsets and sizes of different state managed by the xsave * memory layout. */ -static void setup_xstate_features(void) +static void __init setup_xstate_features(void) { int eax, ebx, ecx, edx, leaf = 0x2; -- cgit v1.2.1 From 1cff92d8fdb27684308864d9cdb324bee43b40ab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 21 Jul 2010 14:23:10 -0700 Subject: x86, xsave: Make xstate_enable_boot_cpu() __init, protect on CPU 0 xstate_enable_boot_cpu() is, as the name implies, only used on the boot CPU; furthermore, it invokes alloc_bootmem(), which is __init; hence it needs to be tagged __init rather than __cpuinit. Furthermore, it is *not* safe in the long run to rely on CPU 0 only coming online during the early boot -- at some point we're going to support offlining (and re-onlining) the boot CPU, and at that point we must not call xstate_enable_boot_cpu() again. The code is a fair bit more obscure than one would like, because the __ref overrides aren't quite powerful enough. Signed-off-by: H. Peter Anvin Acked-by: Suresh Siddha Cc: Robert Richter LKML-Reference: <4C476236.1020302@zytor.com> --- arch/x86/kernel/xsave.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index cfc7901ee94e..b2549c3eb2c3 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -360,10 +360,10 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -static inline void xstate_enable(u64 mask) +static inline void xstate_enable(void) { set_in_cr4(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* @@ -421,7 +421,7 @@ static void __init setup_xstate_init(void) /* * Enable and initialize the xsave feature. */ -static void __cpuinit xstate_enable_boot_cpu(void) +static void __init xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; @@ -444,7 +444,7 @@ static void __cpuinit xstate_enable_boot_cpu(void) */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xstate_enable(pcntxt_mask); + xstate_enable(); /* * Recompute the context size for enabled features @@ -462,16 +462,22 @@ static void __cpuinit xstate_enable_boot_cpu(void) pcntxt_mask, xstate_size); } +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ void __cpuinit xsave_init(void) { + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + if (!cpu_has_xsave) return; - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - xstate_enable_boot_cpu(); - else - xstate_enable(pcntxt_mask); + this_func = next_func; + next_func = xstate_enable; + this_func(); } -- cgit v1.2.1 From cfaa71ee9794472598d3966c3315cd6bd8f953d3 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:27 -0400 Subject: x86: Use symbolic MSR names Use symbolic MSR names instead of hardcoding the MSR index. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-2-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/realmode/wakeup.S | 2 +- arch/x86/kernel/verify_cpu_64.S | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e296010..28595d6df47c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -104,7 +104,7 @@ _start: movl %eax, %ecx orl %edx, %ecx jz 1f - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a1..56a8c2a867d9 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S @@ -31,6 +31,7 @@ */ #include +#include verify_cpu: pushfl # Save caller passed flags @@ -88,7 +89,7 @@ verify_cpu_sse_test: je verify_cpu_sse_ok test %di,%di jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR + movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr -- cgit v1.2.1 From 650fb4393dff543bc980d361555c489fbdeed088 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 17 Jul 2010 09:03:28 -0400 Subject: x86-64: Simplify loading initial_gs Load initial_gs as two 32-bit values instead of splitting a 64-bit value. Signed-off-by: Brian Gerst LKML-Reference: <1279371808-24804-3-git-send-email-brgerst@gmail.com> Reviewed-by: Pekka Enberg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a6..239046bd447f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) * init data section till per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq initial_gs(%rip),%rax - movq %rax,%rdx - shrq $32,%rdx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx wrmsr /* esi is pointer to real mode structure with interesting info. -- cgit v1.2.1 From 4c21adf26f8fcf86a755b9b9f55c2e9fd241e1fb Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Jul 2010 16:59:34 -0700 Subject: x86 cpufreq, perf: Make trace_power_frequency cpufreq driver independent and fix the broken case if a core's frequency depends on others. trace_power_frequency was only implemented in a rather ungeneric way in acpi-cpufreq driver's target() function only. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers. trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequency depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and pass the cpu which gets switched automatically fixes this. Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Renninger Cc: davej@codemonkey.org.uk Signed-off-by: Ingo Molnar Cc: Dave Jones Acked-by: Arjan van de Ven Cc: Robert Schoene Tested-by: Robert Schoene Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40ee..cee5263927c1 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..787572d43d9c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -478,7 +478,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.2.1 From bee6ab53e652a414af20392899879b58cd80d033 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:39:33 +0100 Subject: x86: early PV on HVM features initialization. Initialize basic pv on hvm features adding a new Xen HVM specific hypervisor_x86 structure. Don't try to initialize xen-kbdfront and xen-fbfront when running on HVM because the backends are not available. Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Yaozu (Eddie) Dong Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/cpu/hypervisor.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index dd531cc56a8f..bffd47c10fed 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,6 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, + &x86_hyper_xen_hvm, }; const struct hypervisor_x86 *x86_hyper; -- cgit v1.2.1 From 38e20b07efd541a959de367dc90a17f92ce2e8a6 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 14 May 2010 12:40:51 +0100 Subject: x86/xen: event channels delivery on HVM. Set the callback to receive evtchns from Xen, using the callback vector delivery mechanism. The traditional way for receiving event channel notifications from Xen is via the interrupts from the platform PCI device. The callback vector is a newer alternative that allow us to receive notifications on any vcpu and doesn't need any PCI support: we allocate a vector exclusively to receive events, in the vector handler we don't need to interact with the vlapic, therefore we avoid a VMEXIT. Signed-off-by: Stefano Stabellini Signed-off-by: Sheng Yang Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/entry_32.S | 3 +++ arch/x86/kernel/entry_64.S | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..6b196834a0dd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1166,6 +1166,9 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0697ff139837..490ae2bb18a8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ /* -- cgit v1.2.1 From b43275d661baa5f1f72dacd9033d6eda09d9fe87 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 Jul 2010 10:38:45 -0700 Subject: xen/pvhvm: fix build problem when !CONFIG_XEN x86_hyper_xen_hvm is only defined when Xen is enabled in the kernel config. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/cpu/hypervisor.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index bffd47c10fed..5bccedcb9121 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_XEN &x86_hyper_xen_hvm, +#endif }; const struct hypervisor_x86 *x86_hyper; -- cgit v1.2.1 From 8c73626ab28527b7eb7f3061c027fbfe530c488c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:18 -0700 Subject: x86: Fix vtime/file timestamp inconsistencies Due to vtime calling vgettimeofday(), its possible that an application could call time();create("stuff",O_RDRW); only to see the file's creation timestamp to be before the value returned by time. A similar way to reproduce the issue is to compare the vsyscall time() with the syscall time(), and observe ordering issues. The modified test case from Oleg Nesterov below can illustrate this: int main(void) { time_t sec1,sec2; do { sec1 = time(&sec2); sec2 = syscall(__NR_time, NULL); } while (sec1 <= sec2); printf("vtime: %d.000000\n", sec1); printf("time: %d.000000\n", sec2); return 0; } The proper fix is to make vtime use the same time value as current_kernel_time() (which is exported via update_vsyscall) instead of vgettime(). Thanks to Jiri Olsa for bringing up the issue and catching bugs in earlier verisons of this fix. Signed-off-by: John Stultz Cc: Jiri Olsa Cc: Oleg Nesterov LKML-Reference: <1279068988-21864-2-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60f..dce0c3c5a787 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - struct timeval tv; + unsigned seq; time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, NULL); - result = tv.tv_sec; + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + result = __vsyscall_gtod_data.wall_time_sec; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + if (t) *t = result; return result; -- cgit v1.2.1 From 7615856ebfee52b080c22d263ca4debbd0df0ac1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:23 -0700 Subject: timkeeping: Fix update_vsyscall to provide wall_to_monotonic offset update_vsyscall() did not provide the wall_to_monotoinc offset, so arch specific implementations tend to reference wall_to_monotonic directly. This limits future cleanups in the timekeeping core, so this patch fixes the update_vsyscall interface to provide wall_to_monotonic, allowing wall_to_monotonic to be made static as planned in Documentation/feature-removal-schedule.txt Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: Anton Blanchard Cc: Paul Mackerras Cc: Tony Luck LKML-Reference: <1279068988-21864-7-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dce0c3c5a787..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -- cgit v1.2.1 From f12a15be63d1de9a35971f35f06b73088fa25c3a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 13 Jul 2010 17:56:27 -0700 Subject: x86: Convert common clocksources to use clocksource_register_hz/khz This converts the most common of the x86 clocksources over to use clocksource_register_hz/khz. Signed-off-by: John Stultz LKML-Reference: <1279068988-21864-11-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 13 +++++++++---- arch/x86/kernel/tsc.c | 5 +---- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d731175..33dbcc4ec5ff 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,6 @@ #include #define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ @@ -787,7 +786,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 @@ -798,6 +796,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; + u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -832,9 +831,15 @@ static int hpet_clocksource_register(void) * mult = (hpet_period * 2^shift)/10^6 * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); - clocksource_register(&clocksource_hpet); + /* Need to convert hpet_period (fsec/cyc) to cyc/sec: + * + * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) + * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period + */ + hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + do_div(hpet_freq, hpet_period); + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); return 0; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..ce8e50239332 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { .read = read_tsc, .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 @@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) static void __init init_tsc_clocksource(void) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + clocksource_register_khz(&clocksource_tsc, tsc_khz); } #ifdef CONFIG_X86_64 -- cgit v1.2.1 From 80a506b8fdcfa868bb53eb740f928217d0966fc1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 17:14:24 +0200 Subject: x86/amd-iommu: Export cache-coherency capability This patch exports the capability of the AMD IOMMU to force cache coherency of DMA transactions through the IOMMU-API. This is required to disable some nasty hacks in KVM when this capability is not available. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 29dd3b9f2f09..fa044e1e30a2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -2572,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, static int amd_iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap) { + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return 1; + } + return 0; } -- cgit v1.2.1 From 11637e4b7dc098e9a863f0a619d55ebc60f5949e Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:25 -0500 Subject: fanotify: fanotify_init syscall declaration This patch defines a new syscall fanotify_init() of the form: int sys_fanotify_init(unsigned int flags, unsigned int event_f_flags, unsigned int priority) This syscall is used to create and fanotify group. This is very similar to the inotify_init() syscall. Signed-off-by: Eric Paris --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..e38793b50e1d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,4 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_fanotify_init -- cgit v1.2.1 From bbaa4168b2d2d8cc674e6d35806e8426aef464b8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 17 Dec 2009 21:24:26 -0500 Subject: fanotify: sys_fanotify_mark declartion This patch simply declares the new sys_fanotify_mark syscall int fanotify_mark(int fanotify_fd, unsigned int flags, u64_mask, int dfd const char *pathname) Signed-off-by: Eric Paris --- arch/x86/kernel/syscall_table_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index e38793b50e1d..07ad5eb7cc5c 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -338,3 +338,4 @@ ENTRY(sys_call_table) .long sys_perf_event_open .long sys_recvmmsg .long sys_fanotify_init + .long sys_fanotify_mark -- cgit v1.2.1 From d78d671db478eb8b14c78501c0cee1cc7baf6967 Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:30 +0200 Subject: x86, cpu: AMD errata checking framework Errata are defined using the AMD_LEGACY_ERRATUM() or AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that have an OSVW id assigned, which it takes as first argument. Both take a variable number of family-specific model-stepping ranges created by AMD_MODEL_RANGE(). Iff an erratum has an OSVW id, OSVW is available on the CPU, and the OSVW id is known to the hardware, it is used to determine whether an erratum is present. Otherwise, the model-stepping ranges are matched against the current CPU to find out whether the erratum applies. For certain special errata, the code using this framework might have to conduct further checks to make sure an erratum is really (not) present. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 12b9cff047c1..80665410b064 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -609,3 +609,63 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 8) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} -- cgit v1.2.1 From 9d8888c2a214aece2494a49e699a097c2ba9498b Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:31 +0200 Subject: x86, cpu: Clean up AMD erratum 400 workaround Remove check_c1e_idle() and use the new AMD errata checking framework instead. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-2-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 5 +++++ arch/x86/kernel/process.c | 39 ++------------------------------------- 2 files changed, 7 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 80665410b064..a62a4ae7a11a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -628,6 +628,11 @@ cpu_dev_register(amd_cpu_dev); * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); */ +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + + bool cpu_has_amd_erratum(const int *erratum) { struct cpuinfo_x86 *cpu = ¤t_cpu_data; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..553b02f13094 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,42 +525,6 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} - static cpumask_var_t c1e_mask; static int c1e_detected; @@ -638,7 +602,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else -- cgit v1.2.1 From 1be85a6d93f4207d8c2c6238c4a96895e28cefba Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 28 Jul 2010 19:09:32 +0200 Subject: x86, cpu: Use AMD errata checking framework for erratum 383 Use the AMD errata checking framework instead of open-coding the test. Signed-off-by: Hans Rosenfeld LKML-Reference: <1280336972-865982-3-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a62a4ae7a11a..30f30dcbdb82 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,6 +632,8 @@ const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); bool cpu_has_amd_erratum(const int *erratum) { -- cgit v1.2.1 From a5b91606bdc9d0a0d036d2d829a22921c705573e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:23:20 -0700 Subject: x86, cpu: Export AMD errata definitions Exprot the AMD errata definitions, since they are needed by kvm_amd.ko if that is built as a module. Doing "make allmodconfig" during testing would have caught this. Signed-off-by: H. Peter Anvin Cc: Hans Rosenfeld LKML-Reference: <1280336972-865982-1-git-send-email-hans.rosenfeld@amd.com> --- arch/x86/kernel/cpu/amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 30f30dcbdb82..60a57b13082d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,9 +631,11 @@ cpu_dev_register(amd_cpu_dev); const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_400); const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_383); bool cpu_has_amd_erratum(const int *erratum) { @@ -676,3 +678,5 @@ bool cpu_has_amd_erratum(const int *erratum) return false; } + +EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); -- cgit v1.2.1 From 90c8f92f5c807807ca74d5f2f313794925174e6b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Jul 2010 16:53:49 -0700 Subject: x86, asm: Move cmpxchg emulation code to arch/x86/lib Move cmpxchg emulation code from arch/x86/kernel/cpu (which is otherwise CPU identification) to arch/x86/lib, where other emulation code lives already. Signed-off-by: H. Peter Anvin LKML-Reference: --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/cmpxchg.c | 72 ------------------------------------------- 2 files changed, 1 insertion(+), 73 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cmpxchg.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3a785da34b6f..c47c43914ba7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,7 +16,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o -obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf572cc..000000000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include -#include -#include - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - -- cgit v1.2.1 From ca65f9fc0c447da5b270b05c41c21b19c88617c3 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Jul 2010 14:37:48 +0100 Subject: Introduce CONFIG_XEN_PVHVM compile option This patch introduce a CONFIG_XEN_PVHVM compile time option to enable/disable Xen PV on HVM support. Signed-off-by: Stefano Stabellini --- arch/x86/kernel/cpu/hypervisor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 5bccedcb9121..8095f8611f8a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,7 +34,7 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { &x86_hyper_vmware, &x86_hyper_ms_hyperv, -#ifdef CONFIG_XEN +#ifdef CONFIG_XEN_PVHVM &x86_hyper_xen_hvm, #endif }; -- cgit v1.2.1 From 30da55242818a8ca08583188ebcbaccd283ad4d9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 23 Jul 2010 14:56:28 +0100 Subject: PCI: MSI: Restore read_msi_msg_desc(); add get_cached_msi_msg_desc() commit 2ca1af9aa3285c6a5f103ed31ad09f7399fc65d7 "PCI: MSI: Remove unsafe and unnecessary hardware access" changed read_msi_msg_desc() to return the last MSI message written instead of reading it from the device, since it may be called while the device is in a reduced power state. However, the pSeries platform code really does need to read messages from the device, since they are initially written by firmware. Therefore: - Restore the previous behaviour of read_msi_msg_desc() - Add new functions get_cached_msi_msg{,_desc}() which return the last MSI message written - Use the new functions where appropriate Acked-by: Michael Ellerman Signed-off-by: Ben Hutchings Signed-off-by: Jesse Barnes --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26d..4dc0084ec1b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3397,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); -- cgit v1.2.1 From 68f202e4e87cfab4439568bf397fcc5c7cf8d729 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 30 Jul 2010 11:46:42 -0700 Subject: x86, mtrr: Use stop machine context to rendezvous all the cpu's Use the stop machine context rather than IPI's to rendezvous all the cpus for MTRR initialization that happens during cpu bringup or for MTRR modifications during runtime. This avoids deadlock scenario (reported by Prarit) like: cpu A holds a read_lock (tasklist_lock for example) with irqs enabled cpu B waits for the same lock with irqs disabled using write_lock_irq cpu C doing set_mtrr() (during AP bringup for example), which will try to rendezvous all the cpus using IPI's This will result in C and A come to the rendezvous point and waiting for B. B is stuck forever waiting for the lock and thus not reaching the rendezvous point. Using stop cpu (run in the process context of per cpu based keventd) to do this rendezvous, avoids this deadlock scenario. Also make sure all the cpu's are in the rendezvous handler before we proceed with the local_irq_save() on each cpu. This lock step disabling irqs on all the cpus will avoid other deadlock scenarios (for example involving with the blocking smp_call_function's etc). [ This problem is very old. Marking -stable only for 2.6.35 as the stop_one_cpu_nowait() API is present only in 2.6.35. Any older kernel interested in this fix need to do some more work in backporting this patch. ] Reported-by: Prarit Bhargava Signed-off-by: Suresh Siddha LKML-Reference: <1280515602.2682.10.camel@sbsiddha-MOBL3.sc.intel.com> Acked-by: Prarit Bhargava Cc: stable@kernel.org [2.6.35] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 56 +++++++++++++++++++++++++++++++---------- arch/x86/kernel/smpboot.c | 7 ++++++ 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b602..01c0f3ee6cc3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -35,6 +35,7 @@ #include /* FIXME: kvm_para.h needs this */ +#include #include #include #include @@ -143,22 +144,28 @@ struct set_mtrr_data { mtrr_type smp_type; }; +static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); + /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_work_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; + atomic_dec(&data->count); + while (!atomic_read(&data->gate)) + cpu_relax(); + local_irq_save(flags); atomic_dec(&data->count); - while (!atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ @@ -173,12 +180,13 @@ static void ipi_handler(void *info) } atomic_dec(&data->count); - while (atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * of CPUs. As each CPU announces that it started the rendezvous handler by + * decrementing the count, We reset data.count and set the data.gate flag + * allowing all the cpu's to proceed with the work. As each cpu disables + * interrupts, it'll decrement data.count once. We wait until it hits 0 and + * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they + * are waiting for that flag to be cleared. Once it's cleared, each * CPU goes through the transition of updating MTRRs. * The CPU vendors may each do it differently, * so we call mtrr_if->set() callback and let them take care of it. * When they're done, they again decrement data->count and wait for data.gate - * to be reset. + * to be set. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * @@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ { struct set_mtrr_data data; unsigned long flags; + int cpu; + + preempt_disable(); data.smp_reg = reg; data.smp_base = base; @@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.gate, 0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + for_each_online_cpu(cpu) { + struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + + if (cpu == smp_processor_id()) + continue; + + stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); + } - local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); @@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ smp_wmb(); atomic_set(&data.gate, 1); + local_irq_save(flags); + + while (atomic_read(&data.count)) + cpu_relax(); + + /* Ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + smp_wmb(); + atomic_set(&data.gate, 0); + /* Do our MTRR business */ /* @@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate, 0); + atomic_set(&data.gate, 1); /* * Wait here for everyone to have seen the gate change @@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ cpu_relax(); local_irq_restore(flags); + preempt_enable(); } /** diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c4f33b2e77d6..11015fd1abbc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,13 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) -- cgit v1.2.1 From 9792db6174d9927700ed288e6d74b9391bf785d1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:42 -0700 Subject: x86, cpu: Package Level Thermal Control, Power Limit Notification definitions Add package level thermal and power limit feature support. The two MSRs and features are new starting with Intel's Sandy Bridge processor. Please check Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit Notification and 14.6 Package Level Thermal Management. This patch also fixes a bug which defines reverse THERM_INT_LOW_ENABLE bit and THERM_INT_HIGH_ENABLE bit. [ hpa: fixed up against current tip:x86/cpu ] Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-2-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/scattered.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 9815364b477e..34b4dad6f0b8 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -33,6 +33,8 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, -- cgit v1.2.1 From 25971865d48a8d0ece5307a59dbd3f06d05a7567 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 16 Jun 2010 23:19:28 -0400 Subject: x86, olpc: Use pr_debug() for EC commands Unconditionally printing EC debug messages was helpful when we were actually debugging the EC, but during normal operation it can get pretty annoying. Using pr_debug allows us finer-grained control. Signed-off-by: Andres Salomon LKML-Reference: <20100616231928.16b539f0@dev.queued.net> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/olpc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 156605281f56..f5ff3903b38b 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -142,7 +142,7 @@ restart: * The OBF flag will sometimes misbehave due to what we believe * is a hardware quirk.. */ - printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); outb(cmd, 0x6c); if (wait_on_ibf(0x6c, 0)) { @@ -159,8 +159,7 @@ restart: " EC accept data!\n"); goto err; } - printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", - inbuf[i]); + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); outb(inbuf[i], 0x68); } } @@ -173,8 +172,7 @@ restart: goto restart; } outbuf[i] = inb(0x68); - printk(KERN_DEBUG "olpc-ec: received 0x%x\n", - outbuf[i]); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); } } -- cgit v1.2.1 From 54e5bc020ce1c959eaa7be18cedb734b6b13745e Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Mon, 28 Jun 2010 22:00:29 -0400 Subject: x86, olpc: Constify an olpc_ofw() arg The arguments passed to OFW shouldn't be modified; update the 'args' argument of olpc_ofw to reflect this. This saves us some later casting away of consts. Signed-off-by: Andres Salomon LKML-Reference: <20100628220029.1555ac24@debian> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/olpc.c | 2 +- arch/x86/kernel/olpc_ofw.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index f5ff3903b38b..0e0cdde519be 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -188,7 +188,7 @@ static void __init platform_detect(void) { size_t propsize; __be32 rev; - void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; void *res[] = { &propsize }; if (olpc_ofw("getprop", args, res) || propsize != 4) { diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c index f5d499fbe74f..3218aa71ab5e 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/kernel/olpc_ofw.c @@ -40,7 +40,7 @@ void __init setup_olpc_ofw_pgd(void) early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); } -int __olpc_ofw(const char *name, int nr_args, void **args, int nr_res, +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, void **res) { int ofw_args[MAXARGS + 3]; -- cgit v1.2.1 From c4026cfd8febcd63dd278894108839f30e525a0e Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 30 Jul 2010 14:10:55 -0500 Subject: x86, UV: Initialize BAU hub map Fix uninitialized uvhub_mask: - An unitialized bit map variable was causing initialization of non-existant hubs (this one causes boot panics). - And the bit map was too small for large machines. This patch makes it dynamic in size. - Fix the case where socket 0 has no enabled cpu's. Don't assume every hub has a socket 0. - uv_init_per_cpu() should be __init. Signed-off-by: Cliff Wickman Cc: # for .35.x LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 59efb5390b37..312ef0292815 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1484,15 +1484,16 @@ calculate_destination_timeout(void) /* * initialize the bau_control structure for each cpu */ -static void uv_init_per_cpu(int nuvhubs) +static void __init uv_init_per_cpu(int nuvhubs) { int i; int cpu; int pnode; int uvhub; + int have_hmaster; short socket = 0; unsigned short socket_mask; - unsigned int uvhub_mask; + unsigned char *uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1516,28 +1517,29 @@ static void uv_init_per_cpu(int nuvhubs) uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - uvhub_mask |= (1 << uvhub); + *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; /* kludge: 'assuming' one node per socket, and assuming that disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1);; + socket = (cpu_to_node(cpu) & 1); bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - uvhub = 0; - while (uvhub_mask) { - if (!(uvhub_mask & 1)) - goto nexthub; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) + continue; + have_hmaster = 0; bdp = &uvhub_descs[uvhub]; socket_mask = bdp->socket_mask; socket = 0; @@ -1551,8 +1553,10 @@ static void uv_init_per_cpu(int nuvhubs) bcp->cpu = cpu; if (i == 0) { smaster = bcp; - if (socket == 0) + if (!have_hmaster) { + have_hmaster++; hmaster = bcp; + } } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; @@ -1566,11 +1570,9 @@ nextsocket: socket++; socket_mask = (socket_mask >> 1); } -nexthub: - uvhub++; - uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); + kfree(uvhub_mask); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; -- cgit v1.2.1 From 5ee481da7b62a992b91f958bf26aaaa92354c170 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 17 May 2010 17:22:23 +0800 Subject: x86: Export FPU API for KVM use Also add some constants. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kernel/i387.c | 3 ++- arch/x86/kernel/process.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 86cef6b32253..c4444bce8469 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -107,7 +107,7 @@ void __cpuinit fpu_init(void) } #endif /* CONFIG_X86_64 */ -static void fpu_finit(struct fpu *fpu) +void fpu_finit(struct fpu *fpu) { #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { @@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu) fp->fos = 0xffff0000u; } } +EXPORT_SYMBOL_GPL(fpu_finit); /* * The _current_ task is using the FPU for the first time diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..ebcfcceccc72 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit v1.2.1 From c15a5958a0b6dbf06b3c05972694f04a0c50a4cf Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:22 -0400 Subject: x86-64, asm: Directly access per-cpu IST Use a direct per-cpu reference for the IST instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-1-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ffa..59af275b37a2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,6 +1065,7 @@ ENTRY(\sym) END(\sym) .endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) INTR_FRAME @@ -1076,10 +1077,9 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %r12) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) -- cgit v1.2.1 From 72c511dd596cff88d6523f231a0fbb8f73006d51 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 31 Jul 2010 12:48:23 -0400 Subject: x86-32, asm: Directly access per-cpu GDT Use a direct per-cpu reference for the GDT instead of using a scratch register. Signed-off-by: Brian Gerst LKML-Reference: <1280594903-6341-2-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cd49141cf153..233c5829e7ac 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -611,14 +611,14 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ sub %eax, %edx /* offset (low word is 0) */ - PER_CPU(gdt_page, %ebx) shr $16, %edx - mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ - mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 push %eax /* new kernel esp */ @@ -791,9 +791,8 @@ ptregs_clone: * normal stack and adjusts ESP with the matching offset. */ /* fixup the stack */ - PER_CPU(gdt_page, %ebx) - mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ - mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS -- cgit v1.2.1 From e8c534ec068af1a0845aceda373a9bfd2de62030 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Tue, 27 Jul 2010 18:53:35 +0200 Subject: x86: Fix keeping track of AMD C1E Accomodate the original C1E-aware idle routine to the different times during boot when the BIOS enables C1E. While at it, remove the synthetic CPUID flag in favor of a single global setting which denotes C1E status on the system. [ hpa: changed c1e_enabled to be a bool; clarified cpu bit 3:21 comment ] Signed-off-by: Michal Schmidt LKML-Reference: <20100727165335.GA11630@aftab> Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin Acked-by: Thomas Gleixner --- arch/x86/kernel/process.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 553b02f13094..b944f89c4e6e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,8 +525,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } +bool c1e_detected; +EXPORT_SYMBOL(c1e_detected); + static cpumask_var_t c1e_mask; -static int c1e_detected; void c1e_remove_cpu(int cpu) { @@ -548,12 +550,12 @@ static void c1e_idle(void) u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = 1; + c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } -- cgit v1.2.1 From fe96eb404e33b59bb39f7050205f7c56c1c7d686 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 18 Mar 2010 13:53:24 -0400 Subject: x86: Detect whether we should use Xen SWIOTLB. It is paramount that we call pci_xen_swiotlb_detect before pci_swiotlb_detect as both implementations use the 'swiotlb' and 'swiotlb_force' flags. The pci-xen_swiotlb_detect inhibits the swiotlb_force and swiotlb flag so that the native SWIOTLB implementation is not enabled when running under Xen. [since v1 changed two Cc's to Acked-by] Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Jeremy Fitzhardinge Acked-by: FUJITA Tomonori [http://lkml.org/lkml/2010/7/27/374] Cc: Albert Herranz Cc: Ian Campbell Cc: Thomas Gleixner Acked-by: "H. Peter Anvin" [conditional http://lkml.org/lkml/2010/8/2/324] Cc: x86@kernel.org Cc: Jesse Barnes --- arch/x86/kernel/pci-dma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 4b7e3d8b01dd..9f07cfcbd3a5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -13,6 +13,7 @@ #include #include #include +#include static int forbid_dac __read_mostly; @@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void) /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); - if (pci_swiotlb_detect()) + if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) goto out; gart_iommu_hole_init(); @@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void) /* needs to be called after gart_iommu_hole_init */ amd_iommu_detect(); out: + pci_xen_swiotlb_init(); + pci_swiotlb_init(); } @@ -296,7 +299,7 @@ static int __init pci_iommu_init(void) #endif x86_init.iommu.iommu_init(); - if (swiotlb) { + if (swiotlb || xen_swiotlb) { printk(KERN_INFO "PCI-DMA: " "Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_print_info(); -- cgit v1.2.1 From 9f242dc10e0c3c1eb32d8c83c18650a35fd7f80d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 2 Aug 2010 16:10:37 -0700 Subject: x86, vmware: Preset lpj values when on VMware. When running on VMware's platform, we have seen situations where the AP's try to calibrate the lpj values and fail to get good calibration runs becasue of timing issues. As a result delays don't work correctly on all cpus. The solutions is to set preset_lpj value based on the current tsc frequency value. This is similar to what KVM does as well. Signed-off-by: Alok N Kataria LKML-Reference: <1280790637.14933.29.camel@ank32.eng.vmware.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/vmware.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff588445..227b0448960d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -51,7 +51,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } -- cgit v1.2.1 From 98a5ae2d99b78d29d2d31283cd8b481a44f41fd3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 18 May 2010 13:59:05 +0200 Subject: x86, mce: Notify about corrected events too Notify all parties registered on the mce decoder chain about logged correctable MCEs. Signed-off-by: Borislav Petkov Acked-by: Doug Thompson Acked-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 18cc42562250..1970ef911c99 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -600,6 +600,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); add_taint(TAINT_MACHINE_CHECK); } -- cgit v1.2.1 From 5d77b85458f656923b85291a4ff56ed44859ed52 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Tue, 20 Jul 2010 13:52:00 -0400 Subject: [CPUFREQ] pcc driver should check for pcch method before calling _OSC The pcc specification documents an _OSC method that's incompatible with the one defined as part of the ACPI spec. This shouldn't be a problem as both are supposed to be guarded with a UUID. Unfortunately approximately nobody (including HP, who wrote this spec) properly check the UUID on entry to the _OSC call. Right now this could result in surprising behaviour if the pcc driver performs an _OSC call on a machine that doesn't implement the pcc specification. Check whether the PCCH method exists first in order to reduce this probability. Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ce7cde713e71..01bd25c3c7ca 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -397,13 +397,17 @@ static int __init pcc_cpufreq_probe(void) struct pcc_memory_resource *mem_resource; struct pcc_register_resource *reg_resource; union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle; + acpi_handle handle, osc_handle, pcch_handle; int ret = 0; status = acpi_get_handle(NULL, "\\_SB", &handle); if (ACPI_FAILURE(status)) return -ENODEV; + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + status = acpi_get_handle(handle, "_OSC", &osc_handle); if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); -- cgit v1.2.1 From 0d9715d64fe118dd0957a29e344972b8d3f960e7 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 23 Jul 2010 23:06:52 +0100 Subject: [CPUFREQ] fix double freeing in error path of pcc-cpufreq Prevent double freeing on error path. Signed-off-by: Daniel J Blueman Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 01bd25c3c7ca..900702888bfb 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } + if (out_obj->type != ACPI_TYPE_BUFFER) + return -ENODEV; errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } + if (errors) + return -ENODEV; supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } + if (!(supported & 0x1)) + return -ENODEV; out_free: kfree(output.pointer); -- cgit v1.2.1 From 6ebdf777ba034d2b54c99f28a4b18dabf286d8e5 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 15 Jul 2010 11:44:00 -0400 Subject: [CPUFREQ] Fix PCC driver error path The PCC cpufreq driver unmaps the mailbox address range if any CPUs fail to initialise, but doesn't do anything to remove the registered CPUs from the cpufreq core resulting in failures further down the line. We're better off simply returning a failure - the cpufreq core will unregister us cleanly if we end up with no successfully registered CPUs. Tidy up the failure path and also add a sanity check to ensure that the firmware gives us a realistic frequency - the core deals badly with that being set to 0. Signed-off-by: Matthew Garrett Cc: Naga Chumbalkar Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 900702888bfb..a36de5bbb622 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -541,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!pcch_virt_addr) { result = -1; - goto pcch_null; + goto out; } result = pcc_get_offset(cpu); if (result) { dprintk("init: PCCP evaluation failed\n"); - goto free; + goto out; } policy->max = policy->cpuinfo.max_freq = @@ -556,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) ioread32(&pcch_hdr->minimum_frequency) * 1000; policy->cur = pcc_get_freq(cpu); + if (!policy->cur) { + dprintk("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + dprintk("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); - - return 0; -free: - pcc_clear_mapping(); - free_percpu(pcc_cpu_info); -pcch_null: +out: return result; } -- cgit v1.2.1 From c2f4a2c6e08c7635316dfd25ef706e9104384c56 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 17:55:30 +0200 Subject: [CPUFREQ] powernow-k8: Limit Pstate transition latency check The Pstate transition latency check was added for broken F10h BIOSen which wrongly contain a value of 0 for transition and bus master latency. Fam11h and later, however, (will) have similar transition latency so extend that behavior for them too. Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 7ec2123838e6..3e90cce3dc8b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1023,13 +1023,12 @@ static int get_transition_latency(struct powernow_k8_data *data) } if (max_latency == 0) { /* - * Fam 11h always returns 0 as transition latency. - * This is intended and means "very fast". While cpufreq core - * and governors currently can handle that gracefully, better - * set it to 1 to avoid problems in the future. - * For all others it's a BIOS bug. + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. */ - if (boot_cpu_data.x86 != 0x11) + if (boot_cpu_data.x86 < 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; -- cgit v1.2.1 From 298decfbc44e9a4cb7862ae1b7dfc4e1ba3551b9 Mon Sep 17 00:00:00 2001 From: Marti Raudsepp Date: Wed, 20 Jan 2010 19:19:33 +0200 Subject: [CPUFREQ] powernow-k8: On load failure, remind the user to enable support in BIOS setup On Wed, 2010-01-20 at 16:56 +0100, Thomas Renninger wrote: > But most often this happens if people upgrade their CPU and do not > update their BIOS. > Or the vendor does not recognise the new CPU even if the BIOS got > updated. Maybe some of those people just didn't realize it was disabled in BIOS? If you tell users that it's a firmware bug then they'll probably just give up. > The itself message might be an enhancment, IMO it's not worth a patch. Why do you think so? I spent an hour on hunting down the BIOS upgrade, only to find that it didn't improve anything. It was a day later that I realized that it might be a BIOS option; and the option was literally the _last_ option in the whole BIOS setup. :) This message would have saved the day. > But do not revert the FW_BUG part! Sure, you have a point here. How about this patch? --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 3e90cce3dc8b..c48b44b3b43a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) * www.amd.com */ printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); return -ENODEV; } -- cgit v1.2.1 From 6b72e3934b42930fd40fc42fe762d21be413301c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:35 +0200 Subject: [CPUFREQ] acpi-cpufreq: Fix CPU_ANY CPUFREQ_{PRE,POST}CHANGE notification Signed-off-by: Thomas Renninger CC: venki@google.com CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40ee..cee7aa949c35 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -351,7 +351,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -367,7 +367,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } -- cgit v1.2.1 From 6f4f2723d08534fd4e407e1ef8500b0f4d12c30c Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 20 Apr 2010 13:17:36 +0200 Subject: [CPUFREQ] x86 cpufreq: Make trace_power_frequency cpufreq driver independent and fix the broken case if a core's frequency depends on others. trace_power_frequency was only implemented in a rather ungeneric way in acpi-cpufreq driver's target() function only. -> Move the call to trace_power_frequency to cpufreq.c:cpufreq_notify_transition() where CPUFREQ_POSTCHANGE notifier is triggered. This will support power frequency tracing by all cpufreq drivers trace_power_frequency did not trace frequency changes correctly when the userspace governor was used or when CPU cores' frequency depend on each other. -> Moving this into the CPUFREQ_POSTCHANGE notifier and pass the cpu which gets switched automatically fixes this. Robert Schoene provided some important fixes on top of my initial quick shot version which are integrated in this patch: - Forgot some changes in power_end trace (TP_printk/variable names) - Variable dummy in power_end must now be cpu_id - Use static 64 bit variable instead of unsigned int for cpu_id Signed-off-by: Thomas Renninger CC: davej@redhat.com CC: arjan@infradead.org CC: linux-kernel@vger.kernel.org CC: robert.schoene@tu-dresden.de Tested-by: robert.schoene@tu-dresden.de Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 --- arch/x86/kernel/process.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cee7aa949c35..246cd3afbb5f 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -324,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..787572d43d9c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -371,7 +371,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +441,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -457,7 +457,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -478,7 +478,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); -- cgit v1.2.1 From ccc5638a20b0eb3a66666d9d4dd8fe8f5ad40386 Mon Sep 17 00:00:00 2001 From: Kulikov Vasiliy Date: Sat, 3 Jul 2010 20:03:55 +0400 Subject: [CPUFREQ] arch/x86/kernel/cpu/cpufreq: use for_each_pci_dev() Use for_each_pci_dev() to simplify the code. Signed-off-by: Kulikov Vasiliy Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e3..8c3325fee77c 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -199,7 +199,7 @@ static __init struct pci_dev *gx_detect_chipset(void) } /* detect which companion chip is used */ - while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + for_each_pci_dev(gx_pci) { if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) return gx_pci; } -- cgit v1.2.1 From 55c789bb2bcdcaa8f1f60687b4a9dbd02ffddd88 Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Thu, 15 Jul 2010 20:36:41 +0200 Subject: [CPUFREQ] Convert pci_table entries to PCI_VDEVICE (if PCI_ANY_ID is used) This patch converts pci_table entries, where .subvendor=PCI_ANY_ID and .subdevice=PCI_ANY_ID, .class=0 and .class_mask=0, to use the PCI_VDEVICE macro, and thus improves readability. Signed-off-by: Peter Huewe Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 8c3325fee77c..32974cf84232 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { * Low Level chipset interface * ****************************************************************/ static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, - PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, { 0, }, }; -- cgit v1.2.1 From b30d3304c9c068ccfe6940232834768af75f8c9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Jul 2010 18:05:14 +0200 Subject: [CPUFREQ] powernow-k8: Fix misleading variable naming rdmsr() takes the lower 32 bits as a second argument and the high 32 as a third. Fix the names accordingly since they were swapped. There should be no functionality change resulting from this patch. Signed-off-by: Borislav Petkov Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c48b44b3b43a..90cab2d4ac0d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -912,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, { int i; u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); - data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; -- cgit v1.2.1 From 7e2d81122052c83feeddbebf706b6d53fba7996d Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:28:49 +0800 Subject: [CPUFREQ] Fix section mismatch for longrun_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c52..fc09f142d94d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) +static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) { u32 msr_lo, msr_hi; u32 save_lo, save_hi; @@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, } -static int __init longrun_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) { int result = 0; -- cgit v1.2.1 From 2530573e45c5846cd238db78651f0d236fc78aab Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:03 +0800 Subject: [CPUFREQ] Fix section mismatch for longhaul_cpu_init. Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. Use the __cpuinitdata for data used by the routines marked as __cpuinit. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/longhaul.h | 26 +++++++++++++------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f8261..03162dac6271 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -426,7 +426,7 @@ static int guess_fsb(int mult) } -static int __init longhaul_get_ranges(void) +static int __cpuinit longhaul_get_ranges(void) { unsigned int i, j, k = 0; unsigned int ratio; @@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) } -static void __init longhaul_setup_voltagescaling(void) +static void __cpuinit longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; struct mV_pos minvid, maxvid, vid; @@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) return 0; } -static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname = NULL; diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f79..cbf48fbca881 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h @@ -56,7 +56,7 @@ union msr_longhaul { /* * VIA C3 Samuel 1 & Samuel 2 (stepping 0) */ -static const int __initdata samuel1_mults[16] = { +static const int __cpuinitdata samuel1_mults[16] = { -1, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { -1, /* 1111 -> RESERVED */ }; -static const int __initdata samuel1_eblcr[16] = { +static const int __cpuinitdata samuel1_eblcr[16] = { 50, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { /* * VIA C3 Samuel2 Stepping 1->15 */ -static const int __initdata samuel2_eblcr[16] = { +static const int __cpuinitdata samuel2_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { /* * VIA C3 Ezra */ -static const int __initdata ezra_mults[16] = { +static const int __cpuinitdata ezra_mults[16] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { 120, /* 1111 -> 12.0x */ }; -static const int __initdata ezra_eblcr[16] = { +static const int __cpuinitdata ezra_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { /* * VIA C3 (Ezra-T) [C5M]. */ -static const int __initdata ezrat_mults[32] = { +static const int __cpuinitdata ezrat_mults[32] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { -1, /* 1111 -> RESERVED (12.0x) */ }; -static const int __initdata ezrat_eblcr[32] = { +static const int __cpuinitdata ezrat_eblcr[32] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static const int __initdata nehemiah_mults[32] = { +static const int __cpuinitdata nehemiah_mults[32] = { 100, /* 0000 -> 10.0x */ -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { -1, /* 1111 -> 12.0x */ }; -static const int __initdata nehemiah_eblcr[32] = { +static const int __cpuinitdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -315,7 +315,7 @@ struct mV_pos { unsigned short pos; }; -static const struct mV_pos __initdata vrm85_mV[32] = { +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, @@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} }; -static const unsigned char __initdata mV_vrm85[32] = { +static const unsigned char __cpuinitdata mV_vrm85[32] = { 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 }; -static const struct mV_pos __initdata mobilevrm_mV[32] = { +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, @@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static const unsigned char __initdata mV_mobilevrm[32] = { +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, -- cgit v1.2.1 From 307069cf6c53632adc27de4f49bf5d1d67cb87bb Mon Sep 17 00:00:00 2001 From: Holger Freyther Date: Mon, 19 Jul 2010 03:29:16 +0800 Subject: [CPUFREQ] Fix section mismatch for powernow_cpu_init in powernow-k7.c Use __cpuinit instead of __init for the cpufreq_driver init function like it is done in powernow-k8.c. This is removing the warning generated when compiling with the CONFIG_DEBUG_SECTION_MISMATCH=y option. Signed-off-by: Holger Hans Peter Freyther Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e5..4a45fd6e41ba 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) * We will then get the same kind of behaviour already tested under * the "well-known" other OS. */ -static int __init fixup_sgtc(void) +static int __cpuinit fixup_sgtc(void) { unsigned int sgtc; unsigned int m; @@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(const struct dmi_system_id *d) +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING PFX "%s laptop with broken PST tables in BIOS detected.\n", @@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) * A BIOS update is all that can save them. * Mention this, and disable cpufreq. */ -static struct dmi_system_id __initdata powernow_dmi_table[] = { +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { { .callback = acer_cpufreq_pst, .ident = "Acer Aspire", @@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { { } }; -static int __init powernow_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) { union msr_fidvidstatus fidvidstatus; int result; -- cgit v1.2.1 From 9d1f44ee206a23b975d7d7c6f759efb25e0e61ac Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 3 Aug 2010 13:47:30 -0400 Subject: [CPUFREQ] Remove pointless printk from p4-clockmod. The only machines this is triggering on should be supported by acpi-cpufreq or acpi's internal throttling. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b07..bd1cac747f67 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ -- cgit v1.2.1 From 55d435a227bd28c77afab326de44dfacc0b15059 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:44 -0700 Subject: x86, hwmon: Package Level Thermal/Power: thermal throttling handler Add package level thermal throttle interrupt support. The interrupt handler increases package level thermal throttle count. It also logs the event in MCE log. The package level thermal throttle interrupt happens across threads in a package. Each thread handles the interrupt individually. User level application is supposed to retrieve correct event count and log based on package/thread topology. This is the same situation for core level interrupt handler. In the future, interrupt may be reported only per package or per core. core_throttle_count and package_throttle_count are used for user interface. Previously only throttle_count is used for core throttle count. If you think new core_throttle_count name breaks user interface, I can change this part. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-4-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 89 +++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf9716..d307f9f64c23 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -37,7 +37,7 @@ /* * Current thermal throttling state: */ -struct thermal_state { +struct _thermal_state { bool is_throttled; u64 next_check; @@ -45,6 +45,11 @@ struct thermal_state { unsigned long last_throttle_count; }; +struct thermal_state { + struct _thermal_state core; + struct _thermal_state package; +}; + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -53,11 +58,13 @@ static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(level, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##level##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -66,21 +73,24 @@ static ssize_t therm_throt_sysdev_show_##name( \ ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).level.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_one_ro(package_throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; @@ -106,16 +116,21 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 +static int therm_throt_process(bool is_throttled, int level) { - struct thermal_state *state; + struct _thermal_state *state; unsigned int this_cpu; bool was_throttled; u64 now; this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) + state = &per_cpu(thermal_state, this_cpu).core; + else + state = &per_cpu(thermal_state, this_cpu).package; was_throttled = state->is_throttled; state->is_throttled = is_throttled; @@ -132,13 +147,18 @@ static int therm_throt_process(bool is_throttled) /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -149,8 +169,19 @@ static int therm_throt_process(bool is_throttled) /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PTS)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_throttle_attr_group.name); + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) @@ -230,10 +261,25 @@ device_initcall(thermal_throttle_init_device); static void intel_thermal_interrupt(void) { __u64 msr_val; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + CORE_LEVEL) != 0) mce_log_therm_throt_event(msr_val); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + PACKAGE_LEVEL) != 0) + /* + * Set up the most significant bit to notify mce log + * that this thermal event is a package level event. + * This is a temp solution. May be changed in the future + * with mce log infrasture. + */ + mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + } } static void unexpected_thermal_interrupt(void) @@ -338,6 +384,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_THERM_INTERRUPT, l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + smp_thermal_vector = intel_thermal_interrupt; rdmsr(MSR_IA32_MISC_ENABLE, l, h); -- cgit v1.2.1 From 0199114c31798af5b83841b21759b64171060d9b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 29 Jul 2010 17:13:45 -0700 Subject: x86, hwmon: Package Level Thermal/Power: power limit Power limit notification feature is published in Intel 64 and IA-32 Architectures SDMV Vol 3A 14.5.6 Power Limit Notification. It is implemented first on Intel Sandy Bridge platform. The patch handles notification interrupt. Interrupt handler dumps power limit information in log_buf, logs the event in mce log, and increases the event counters (core_power_limit and package_power_limit). Upper level applications could use the data to detect system health or diagnose functionality/performance issues. In the future, the event could be handled in a more fancy way. Signed-off-by: Fenghua Yu LKML-Reference: <1280448826-12004-5-git-send-email-fenghua.yu@intel.com> Reviewed-by: Len Brown Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 183 ++++++++++++++++++++++--------- 1 file changed, 129 insertions(+), 54 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d307f9f64c23..c2a8b26d4fea 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,25 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ struct _thermal_state { - bool is_throttled; - + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; }; struct thermal_state { - struct _thermal_state core; - struct _thermal_state package; + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -62,9 +67,9 @@ static u32 lvtthmr_init __read_mostly; therm_throt_sysdev_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(level, name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##level##_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -75,7 +80,7 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).level.name); \ + per_cpu(thermal_state, cpu).event.name); \ } else \ ret = 0; \ preempt_enable(); \ @@ -83,23 +88,32 @@ static ssize_t therm_throt_sysdev_show_##level##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core, throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); define_therm_throt_sysdev_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(package, throttle_count); +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); + static struct attribute *thermal_throttle_attrs[] = { &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -116,49 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -#define CORE_LEVEL 0 -#define PACKAGE_LEVEL 1 -static int therm_throt_process(bool is_throttled, int level) +static int therm_throt_process(bool new_event, int event, int level) { struct _thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - if (level == CORE_LEVEL) - state = &per_cpu(thermal_state, this_cpu).core; - else - state = &per_cpu(thermal_state, this_cpu).package; + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->throttle_count); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + else + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); add_taint(TAINT_MACHINE_CHECK); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + else + printk(KERN_INFO "CPU%d: %s power limit normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -172,21 +207,29 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) int err; struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - err = sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); if (err) return err; + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, - thermal_throttle_attr_group.name); + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -257,6 +300,17 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. + */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { @@ -264,21 +318,31 @@ static void intel_thermal_interrupt(void) struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(msr_val); + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) - /* - * Set up the most significant bit to notify mce log - * that this thermal event is a package level event. - * This is a temp solution. May be changed in the future - * with mce log infrasture. - */ - mce_log_therm_throt_event(((__u64)1 << 63) | msr_val); + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); } } @@ -381,14 +445,25 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE - | PACKAGE_THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); } smp_thermal_vector = intel_thermal_interrupt; -- cgit v1.2.1 From 12bfa3de63504d879ae427ec1f2884fc46556157 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 5 Aug 2010 09:22:20 -0500 Subject: kgdb,x86: Individual register get/set for x86 Implement the ability to individually get and set registers for kdb and kgdb for x86. Signed-off-by: Jason Wessel Acked-by: H. Peter Anvin CC: Ingo Molnar CC: x86@kernel.org --- arch/x86/kernel/kgdb.c | 168 ++++++++++++++++++++++++------------------------- 1 file changed, 83 insertions(+), 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 01ab17ae2ae7..bae89825e14e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -49,55 +49,94 @@ #include #include -/** - * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs - * @gdb_regs: A pointer to hold the registers in the order GDB wants. - * @regs: The &struct pt_regs of the current process. - * - * Convert the pt_regs in @regs into the format for registers that - * GDB expects, stored in @gdb_regs. - */ -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, + { "fs", 4, -1 }, + { "gs", 4, -1 }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, dx) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, #endif - gdb_regs[GDB_AX] = regs->ax; - gdb_regs[GDB_BX] = regs->bx; - gdb_regs[GDB_CX] = regs->cx; - gdb_regs[GDB_DX] = regs->dx; - gdb_regs[GDB_SI] = regs->si; - gdb_regs[GDB_DI] = regs->di; - gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PC] = regs->ip; +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( #ifdef CONFIG_X86_32 - gdb_regs[GDB_PS] = regs->flags; - gdb_regs[GDB_DS] = regs->ds; - gdb_regs[GDB_ES] = regs->es; - gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_FS] = 0xFFFF; - gdb_regs[GDB_GS] = 0xFFFF; - if (user_mode_vm(regs)) { - gdb_regs[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; - } else { - gdb_regs[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; } -#else - gdb_regs[GDB_R8] = regs->r8; - gdb_regs[GDB_R9] = regs->r9; - gdb_regs[GDB_R10] = regs->r10; - gdb_regs[GDB_R11] = regs->r11; - gdb_regs[GDB_R12] = regs->r12; - gdb_regs[GDB_R13] = regs->r13; - gdb_regs[GDB_R14] = regs->r14; - gdb_regs[GDB_R15] = regs->r15; - gdb_regs32[GDB_PS] = regs->flags; - gdb_regs32[GDB_CS] = regs->cs; - gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + switch (regno) { +#ifdef CONFIG_X86_32 + case GDB_SS: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = __KERNEL_DS; + break; + case GDB_SP: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = kernel_stack_pointer(regs); + break; + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; #endif + } + return dbg_reg_def[regno].name; } /** @@ -150,47 +189,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_SP] = p->thread.sp; } -/** - * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. - * @gdb_regs: A pointer to hold the registers we've received from GDB. - * @regs: A pointer to a &struct pt_regs to hold these values in. - * - * Convert the GDB regs in @gdb_regs into the pt_regs, and store them - * in @regs. - */ -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; -#endif - regs->ax = gdb_regs[GDB_AX]; - regs->bx = gdb_regs[GDB_BX]; - regs->cx = gdb_regs[GDB_CX]; - regs->dx = gdb_regs[GDB_DX]; - regs->si = gdb_regs[GDB_SI]; - regs->di = gdb_regs[GDB_DI]; - regs->bp = gdb_regs[GDB_BP]; - regs->ip = gdb_regs[GDB_PC]; -#ifdef CONFIG_X86_32 - regs->flags = gdb_regs[GDB_PS]; - regs->ds = gdb_regs[GDB_DS]; - regs->es = gdb_regs[GDB_ES]; - regs->cs = gdb_regs[GDB_CS]; -#else - regs->r8 = gdb_regs[GDB_R8]; - regs->r9 = gdb_regs[GDB_R9]; - regs->r10 = gdb_regs[GDB_R10]; - regs->r11 = gdb_regs[GDB_R11]; - regs->r12 = gdb_regs[GDB_R12]; - regs->r13 = gdb_regs[GDB_R13]; - regs->r14 = gdb_regs[GDB_R14]; - regs->r15 = gdb_regs[GDB_R15]; - regs->flags = gdb_regs32[GDB_PS]; - regs->cs = gdb_regs32[GDB_CS]; - regs->ss = gdb_regs32[GDB_SS]; -#endif -} - static struct hw_breakpoint { unsigned enabled; unsigned long addr; -- cgit v1.2.1 From 9264b278be42c031dc76517a0d4bb154f5dcf470 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 5 Aug 2010 09:22:24 -0500 Subject: KGDB: Remove set but unused newPC Found by gcc 4.6's new warnings Signed-off-by: Andi Kleen Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bae89825e14e..a8b80979ceb8 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -456,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, { unsigned long addr; char *ptr; - int newPC; switch (remcomInBuffer[0]) { case 'c': @@ -467,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, linux_regs->ip = addr; case 'D': case 'k': - newPC = linux_regs->ip; - /* clear the trace bit */ linux_regs->flags &= ~X86_EFLAGS_TF; atomic_set(&kgdb_cpu_doing_single_step, -1); -- cgit v1.2.1 From df4939350b345ebb44937902827aa75b8ad4998c Mon Sep 17 00:00:00 2001 From: Dongdong Deng Date: Thu, 5 Aug 2010 09:22:25 -0500 Subject: kgdb,x86: use macro HBP_NUM to replace magic number 4 Use the macros provided by the HW breakpoint API. Signed-off-by: Dongdong Deng Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a8b80979ceb8..ef10940e1af0 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -195,7 +195,7 @@ static struct hw_breakpoint { int len; int type; struct perf_event **pev; -} breakinfo[4]; +} breakinfo[HBP_NUM]; static unsigned long early_dr7; @@ -203,7 +203,7 @@ static void kgdb_correct_hw_break(void) { int breakno; - for (breakno = 0; breakno < 4; breakno++) { + for (breakno = 0; breakno < HBP_NUM; breakno++) { struct perf_event *bp; struct arch_hw_breakpoint *info; int val; @@ -290,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (breakinfo[i].addr == addr && breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; if (hw_break_release_slot(i)) { @@ -311,7 +311,7 @@ static void kgdb_remove_all_hw_break(void) int cpu = raw_smp_processor_id(); struct perf_event *bp; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); @@ -331,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (!breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; switch (bptype) { @@ -395,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; if (dbg_is_early) { @@ -640,7 +640,7 @@ void kgdb_arch_late(void) attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); -- cgit v1.2.1 From 5989cd6a1cbf86587edcc856791f960978087311 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 4 Aug 2010 13:30:27 -0700 Subject: x86, apic: Map the local apic when parsing the MP table. This fixes a regression in 2.6.35 from 2.6.34, that is present for select models of Intel cpus when people are using an MP table. The commit cf7500c0ea133d66f8449d86392d83f840102632 "x86, ioapic: In mpparse use mp_register_ioapic" started calling mp_register_ioapic from MP_ioapic_info. An extremely simple change that was obviously correct. Unfortunately mp_register_ioapic did just a little more than the previous hand crafted code and so we gained this call path. The problem call path is: MP_ioapic_info() mp_register_ioapic() io_apic_unique_id() io_apic_get_unique_id() get_physical_broadcast() modern_apic() lapic_get_version() apic_read(APIC_LVR) Which turned out to be a problem because the local apic was not mapped, at that point, unlike the similar point in the ACPI parsing code. This problem is fixed by mapping the local apic when parsing the mptable as soon as we reasonably can. Looking at the number of places we setup the fixmap for the local apic, I see some serious simplification opportunities. For the moment except for not duplicating the setting up of the fixmap in init_apic_mappings, I have not acted on them. The regression from 2.6.34 is tracked in bug https://bugzilla.kernel.org/show_bug.cgi?id=16173 Cc: 2.6.35 Reported-by: David Hill Reported-by: Tvrtko Ursulin Tested-by: Tvrtko Ursulin Signed-off-by: Eric W. Biederman LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/mpparse.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a96489ee6cab..c07e51391a3f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void) * acpi lapic path already maps that address in * acpi_register_lapic_address() */ - if (!acpi_lapic) + if (!acpi_lapic && !smp_found_config) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54be..d7b6f7fb4fec 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } +static void __init smp_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; + /* Initialize the lapic mapping */ + if (!acpi_lapic) + smp_register_lapic_address(mpc->lapic); + if (mpc->oemptr) x86_init.mpparse.smp_read_mpc_oem(mpc); -- cgit v1.2.1 From ad4ecef2f13c790f95b55320f2925c205d8f971f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 2 Aug 2010 15:48:23 +0800 Subject: ACPI, APEI, Rename CPER and GHES severity constants The abbreviation of severity should be SEV instead of SER, so the CPER severity constants are renamed accordingly. GHES severity constants are renamed in the same way too. Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 745b54f9be89..8209472b27a5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m) rcd.hdr.revision = CPER_RECORD_REV; rcd.hdr.signature_end = CPER_SIG_END; rcd.hdr.section_count = 1; - rcd.hdr.error_severity = CPER_SER_FATAL; + rcd.hdr.error_severity = CPER_SEV_FATAL; /* timestamp, platform_id, partition_id are all invalid */ rcd.hdr.validation_bits = 0; rcd.hdr.record_length = sizeof(rcd); @@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m) rcd.sec_hdr.validation_bits = 0; rcd.sec_hdr.flags = CPER_SEC_PRIMARY; rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; - rcd.sec_hdr.section_severity = CPER_SER_FATAL; + rcd.sec_hdr.section_severity = CPER_SEV_FATAL; memcpy(&rcd.mce, m, sizeof(*m)); -- cgit v1.2.1 From 1c250d709fdc8aa5bf42d90be99428a01a256a55 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 5 Aug 2010 19:09:17 +0400 Subject: perf, x86: P4 PMU -- update nmi irq statistics and unmask lvt entry properly In case if last active performance counter is not overflowed at moment of NMI being triggered by another counter, the irq statistics may miss an update stage. As a more serious consequence -- apic quirk may not be triggered so apic lvt entry stay masked. Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Cc: Stephane Eranian Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20100805150917.GA6311@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 107711bf0ee8..febb12cea795 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -656,6 +656,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -666,12 +667,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) WARN_ON_ONCE(hwc->idx != idx); /* it might be unflagged overflow */ - handled = p4_pmu_clear_cccr_ovf(hwc); + overflow = p4_pmu_clear_cccr_ovf(hwc); val = x86_perf_event_update(event); - if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue; + handled += overflow; + /* event overflow for sure */ data.period = event->hw.last_period; @@ -687,7 +690,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled; + return handled > 0; } /* -- cgit v1.2.1 From d7a7c573936a86474c4a5090a45a4bc6e680c117 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 9 Aug 2010 17:20:33 -0700 Subject: x86, ia64, smp: use workqueues unconditionally during do_boot_cpu() Workqueues are now initialized as part of the early_initcall(). So they are available for use during cold boot process aswell. Signed-off-by: Suresh Siddha Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 51620953b18a..a5e928b0cb5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); -- cgit v1.2.1 From 8cbd84f2dd4e52a8771b191030c374ba3e56d291 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 10 Aug 2010 15:35:10 -0700 Subject: x86: fix up system call numbering nit As pointed out by Jiri Slaby: when I resolved the the 32-bit x85 system call entry tables for prlimit (due to the conflict with fanotify), I forgot to add the numbering in comments that we do for every fifth entry. Reported-by: Jiri Slaby Signed-off-by: Linus Torvalds --- arch/x86/kernel/syscall_table_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4802accb9d8d..b35786dc9b8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -339,4 +339,4 @@ ENTRY(sys_call_table) .long sys_recvmmsg .long sys_fanotify_init .long sys_fanotify_mark - .long sys_prlimit64 + .long sys_prlimit64 /* 340 */ -- cgit v1.2.1 From a3da323420d5aa6f7bd15efc7bf34cd6d19e1f1a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 8 Aug 2010 02:37:23 +0900 Subject: [CPUFREQ] add missing __percpu markup in pcc-cpufreq.c pcc_cpu_info is a percpu pointer but was missing __percpu markup. Add it. Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index a36de5bbb622..994230d4dc4e 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -110,7 +110,7 @@ struct pcc_cpu { u32 output_offset; }; -static struct pcc_cpu *pcc_cpu_info; +static struct pcc_cpu __percpu *pcc_cpu_info; static int pcc_cpufreq_verify(struct cpufreq_policy *policy) { -- cgit v1.2.1 From 4936a3b90d79dd8775c6ac23c2cf2dcebe29abde Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 9 Aug 2010 14:20:10 -0700 Subject: x86/hpet: Use the FSEC_PER_SEC constant for femto-second periods The current computation, introduced with f12a15be63, of FSEC_PER_SEC using the multiplication of (FSEC_PER_NSEC * NSEC_PER_SEC) is performed only with 32bit integers on small machines, resulting in an overflow and a *very* short intervals being programmed. An interrupt storm follows. Note that we also have to specify FSEC_PER_SEC as being long long to overcome the same limitations. Signed-off-by: Chris Wilson Signed-off-by: John Stultz Cc: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: H. Peter Anvin Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 33dbcc4ec5ff..351f9c0fea1f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -582,7 +582,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) * scaled math multiplication factor for nanosecond to hpet tick * conversion. */ - hpet_freq = 1000000000000000ULL; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); evt->mult = div_sc((unsigned long) hpet_freq, NSEC_PER_SEC, evt->shift); @@ -837,7 +837,7 @@ static int hpet_clocksource_register(void) * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period */ - hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + hpet_freq = FSEC_PER_SEC; do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); -- cgit v1.2.1 From 3f6c4df7e1f05f2fdb3f20cac9c03f595a72b45d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 13 Aug 2010 23:00:11 +0900 Subject: [CPUFREQ] acpi-cpufreq: add missing __percpu markup acpi_perf_data is a percpu pointer but was missing __percpu markup. Add it. Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 246cd3afbb5f..cd8da247dda1 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -72,7 +72,7 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); /* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance *acpi_perf_data; +static struct acpi_processor_performance __percpu *acpi_perf_data; static struct cpufreq_driver acpi_cpufreq_driver; -- cgit v1.2.1 From c7887325230aec47d47a32562a6e26014a0fafca Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Aug 2010 11:26:22 +0100 Subject: Mark arguments to certain syscalls as being const Mark arguments to certain system calls as being const where they should be but aren't. The list includes: (*) The filename arguments of various stat syscalls, execve(), various utimes syscalls and some mount syscalls. (*) The filename arguments of some syscall helpers relating to the above. (*) The buffer argument of various write syscalls. Signed-off-by: David Howells Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/process.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c5ea5cdbe7b3..17be5ec7cbba 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1185,13 +1185,13 @@ END(kernel_thread_helper) * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. * * C extern interface: - * extern long execve(char *name, char **argv, char **envp) + * extern long execve(const char *name, char **argv, char **envp) * * asm input arguments: * rdi: name, rsi: argv, rdx: envp * * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) + * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) * * do_sys_execve asm fallback arguments: * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d401f1d2d06e..64ecaf0af9af 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -301,7 +301,7 @@ EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. */ -long sys_execve(char __user *name, char __user * __user *argv, +long sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { long error; -- cgit v1.2.1 From f45755b8346f1a089ca7957dce5f4c3c6cb26e6f Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Fri, 13 Aug 2010 15:19:11 +0800 Subject: KVM: fix poison overwritten caused by using wrong xstate size fpu.state is allocated from task_xstate_cachep, the size of task_xstate_cachep is xstate_size. xstate_size is set from cpuid instruction, which is often smaller than sizeof(struct xsave_struct). kvm is using sizeof(struct xsave_struct) to fill in/out fpu.state.xsave, as what we allocated for fpu.state is xstate_size, kernel will write out of memory and caused poison/redzone/padding overwritten warnings. Signed-off-by: Xiaotian Feng Reviewed-by: Sheng Yang Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Suresh Siddha Cc: Brian Gerst Cc: Avi Kivity Cc: Robert Richter Cc: Sheng Yang Cc: Marcelo Tosatti Cc: Gleb Natapov Cc: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kernel/i387.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index c4444bce8469..f855658d2784 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -40,6 +40,7 @@ static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; -- cgit v1.2.1 From 8c8aefce934dc45de641fe78d48ff1b7722d826a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 7 Aug 2010 11:00:59 -0700 Subject: kgdb: add missing __percpu markup in arch/x86/kernel/kgdb.c breakinfo->pev is a pointer to percpu pointer but was missing __percpu markup. Add it. Signed-off-by: Namhyung Kim Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index ef10940e1af0..852b81967a37 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -194,7 +194,7 @@ static struct hw_breakpoint { unsigned long addr; int len; int type; - struct perf_event **pev; + struct perf_event * __percpu *pev; } breakinfo[HBP_NUM]; static unsigned long early_dr7; -- cgit v1.2.1 From d7627467b7a8dd6944885290a03a07ceb28c10eb Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 17 Aug 2010 23:52:56 +0100 Subject: Make do_execve() take a const filename pointer Make do_execve() take a const filename pointer so that kernel_execve() compiles correctly on ARM: arch/arm/kernel/sys_arm.c:88: warning: passing argument 1 of 'do_execve' discards qualifiers from pointer target type This also requires the argv and envp arguments to be consted twice, once for the pointer array and once for the strings the array points to. This is because do_execve() passes a pointer to the filename (now const) to copy_strings_kernel(). A simpler alternative would be to cast the filename pointer in do_execve() when it's passed to copy_strings_kernel(). do_execve() may not change any of the strings it is passed as part of the argv or envp lists as they are some of them in .rodata, so marking these strings as const should be fine. Further kernel_execve() and sys_execve() need to be changed to match. This has been test built on x86_64, frv, arm and mips. Signed-off-by: David Howells Tested-by: Ralf Baechle Acked-by: Russell King Signed-off-by: Linus Torvalds --- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/sys_i386_32.c | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 64ecaf0af9af..57d1868a86aa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. */ -long sys_execve(const char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) +long sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) { long error; char *filename; diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 196552bb412c..d5e06624e34a 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -28,7 +28,9 @@ * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { long __res; asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" -- cgit v1.2.1 From 351af0725e5222e35741011d1ea62215c1ed06db Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Fri, 6 Aug 2010 13:39:08 +0800 Subject: perf, x86: Fix Intel-nhm PMU programming errata workaround Fix the Errata AAK100/AAP53/BD53 workaround, the officialy documented workaround we implemented in: 11164cd: perf, x86: Add Nehelem PMU programming errata workaround doesn't actually work fully and causes a stuck PMU state under load and non-functioning perf profiling. A functional workaround was found by trial & error. Affects all Nehalem-class Intel PMUs. Signed-off-by: Zhang Yanmin Signed-off-by: Peter Zijlstra LKML-Reference: <1281073148.2125.63.camel@ymzhang.sh.intel.com> Cc: Arjan van de Ven Cc: "H. Peter Anvin" Cc: # .35.x Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 81 ++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 214ac860ebe0..d8d86d014008 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added) * Intel Errata AAP53 (model 30) * Intel Errata BD53 (model 44) * - * These chips need to be 'reset' when adding counters by programming - * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 - * either in sequence on the same PMC or on different PMCs. + * The official story: + * These chips need to be 'reset' when adding counters by programming the + * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either + * in sequence on the same PMC or on different PMCs. + * + * In practise it appears some of these events do in fact count, and + * we need to programm all 4 events. */ -static void intel_pmu_nhm_enable_all(int added) +static void intel_pmu_nhm_workaround(void) { - if (added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int i; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + static const unsigned long nhm_magic[4] = { + 0x4300B5, + 0x4300D2, + 0x4300B1, + 0x4300B1 + }; + struct perf_event *event; + int i; + + /* + * The Errata requires below steps: + * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; + * 2) Configure 4 PERFEVTSELx with the magic events and clear + * the corresponding PMCx; + * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; + * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; + * 5) Clear 4 pairs of ERFEVTSELx and PMCx; + */ + + /* + * The real steps we choose are a little different from above. + * A) To reduce MSR operations, we don't run step 1) as they + * are already cleared before this function is called; + * B) Call x86_perf_event_update to save PMCx before configuring + * PERFEVTSELx with magic number; + * C) With step 5), we do clear only when the PERFEVTSELx is + * not used currently. + * D) Call x86_perf_event_set_period to restore PMCx; + */ - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); + /* We always operate 4 pairs of PERF Counters */ + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) + x86_perf_event_update(event); + } - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + for (i = 0; i < 4; i++) { + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + } - for (i = 0; i < 3; i++) { - struct perf_event *event = cpuc->events[i]; + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); - if (!event) - continue; + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) { + x86_perf_event_set_period(event); __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); - } + ARCH_PERFMON_EVENTSEL_ENABLE); + } else + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); } +} + +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) + intel_pmu_nhm_workaround(); intel_pmu_enable_all(added); } -- cgit v1.2.1 From 07a7795ca2e6e66d00b184efb46bd0e23d90d3fe Mon Sep 17 00:00:00 2001 From: Hans Rosenfeld Date: Wed, 18 Aug 2010 16:19:50 +0200 Subject: x86, cpu: Fix regression in AMD errata checking code A bug in the family-model-stepping matching code caused the presence of errata to go undetected when OSVW was not used. This causes hangs on some K8 systems because the E400 workaround is not enabled. Signed-off-by: Hans Rosenfeld LKML-Reference: <1282141190-930137-1-git-send-email-hans.rosenfeld@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 60a57b13082d..ba5f62f45f01 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -669,7 +669,7 @@ bool cpu_has_amd_erratum(const int *erratum) } /* OSVW unavailable or ID unknown, match family-model-stepping range */ - ms = (cpu->x86_model << 8) | cpu->x86_mask; + ms = (cpu->x86_model << 4) | cpu->x86_mask; while ((range = *erratum++)) if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && (ms >= AMD_MODEL_RANGE_START(range)) && -- cgit v1.2.1 From fd89a137924e0710078c3ae855e7cec1c43cb845 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 16 Aug 2010 14:38:33 +0200 Subject: x86-32: Separate 1:1 pagetables from swapper_pg_dir This patch fixes machine crashes which occur when heavily exercising the CPU hotplug codepaths on a 32-bit kernel. These crashes are caused by AMD Erratum 383 and result in a fatal machine check exception. Here's the scenario: 1. On 32-bit, the swapper_pg_dir page table is used as the initial page table for booting a secondary CPU. 2. To make this work, swapper_pg_dir needs a direct mapping of physical memory in it (the low mappings). By adding those low, large page (2M) mappings (PAE kernel), we create the necessary conditions for Erratum 383 to occur. 3. Other CPUs which do not participate in the off- and onlining game may use swapper_pg_dir while the low mappings are present (when leave_mm is called). For all steps below, the CPU referred to is a CPU that is using swapper_pg_dir, and not the CPU which is being onlined. 4. The presence of the low mappings in swapper_pg_dir can result in TLB entries for addresses below __PAGE_OFFSET to be established speculatively. These TLB entries are marked global and large. 5. When the CPU with such TLB entry switches to another page table, this TLB entry remains because it is global. 6. The process then generates an access to an address covered by the above TLB entry but there is a permission mismatch - the TLB entry covers a large global page not accessible to userspace. 7. Due to this permission mismatch a new 4kb, user TLB entry gets established. Further, Erratum 383 provides for a small window of time where both TLB entries are present. This results in an uncorrectable machine check exception signalling a TLB multimatch which panics the machine. There are two ways to fix this issue: 1. Always do a global TLB flush when a new cr3 is loaded and the old page table was swapper_pg_dir. I consider this a hack hard to understand and with performance implications 2. Do not use swapper_pg_dir to boot secondary CPUs like 64-bit does. This patch implements solution 2. It introduces a trampoline_pg_dir which has the same layout as swapper_pg_dir with low_mappings. This page table is used as the initial page table of the booting CPU. Later in the bringup process, it switches to swapper_pg_dir and does a global TLB flush. This fixes the crashes in our test cases. -v2: switch to swapper_pg_dir right after entering start_secondary() so that we are able to access percpu data which might not be mapped in the trampoline page table. Signed-off-by: Joerg Roedel LKML-Reference: <20100816123833.GB28147@aftab> Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 8 +++++++- arch/x86/kernel/setup.c | 2 ++ arch/x86/kernel/smpboot.c | 32 +++++++++++++------------------- arch/x86/kernel/trampoline.c | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ff4c453e13f3..fa8c1b8e09fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -334,7 +334,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $pa(swapper_pg_dir),%eax + movl pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -614,6 +614,8 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(initial_page_table) + .long pa(swapper_pg_dir) /* * BSS section @@ -629,6 +631,10 @@ ENTRY(swapper_pg_dir) #endif swapper_pg_fixmap: .fill 1024,4,0 +#ifdef CONFIG_X86_TRAMPOLINE +ENTRY(trampoline_pg_dir) + .fill 1024,4,0 +#endif ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b008e7883207..c3a4fbb2b996 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1014,6 +1014,8 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + setup_trampoline_page_table(); + tboot_probe(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a5e928b0cb5f..abf4a86ffc54 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -73,7 +73,6 @@ #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; -static int low_mappings; #endif /* State of each CPU */ @@ -281,6 +280,18 @@ notrace static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ + +#ifdef CONFIG_X86_32 + /* + * Switch away from the trampoline page-table + * + * Do this before cpu_init() because it needs to access per-cpu + * data which may not be mapped in the trampoline page-table. + */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + vmi_bringup(); cpu_init(); preempt_disable(); @@ -299,12 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused) legacy_pic->chip->unmask(0); } -#ifdef CONFIG_X86_32 - while (low_mappings) - cpu_relax(); - __flush_tlb_all(); -#endif - /* This must be done before setting cpu_online_mask */ set_cpu_sibling_map(raw_smp_processor_id()); wmb(); @@ -750,6 +755,7 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -897,20 +903,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; -#ifdef CONFIG_X86_32 - /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); - flush_tlb_all(); - low_mappings = 1; - err = do_boot_cpu(apicid, cpu); - zap_low_mappings(false); - low_mappings = 0; -#else - err = do_boot_cpu(apicid, cpu); -#endif if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742d..a874495b3673 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,6 +1,7 @@ #include #include +#include #include #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) @@ -37,3 +38,20 @@ unsigned long __trampinit setup_trampoline(void) memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } + +void __init setup_trampoline_page_table(void) +{ +#ifdef CONFIG_X86_32 + /* Copy kernel address range */ + clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); + + /* Initialize low mappings */ + clone_pgd_range(trampoline_pg_dir, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); +#endif +} -- cgit v1.2.1 From 737480a0d525dae13306296da08029dff545bc72 Mon Sep 17 00:00:00 2001 From: KUMANO Syuhei Date: Sun, 15 Aug 2010 15:18:04 +0900 Subject: kprobes/x86: Fix the return address of multiple kretprobes Fix the return address of subsequent kretprobes when multiple kretprobes are set on the same function. For example: # cd /sys/kernel/debug/tracing # echo "r:event1 sys_symlink" > kprobe_events # echo "r:event2 sys_symlink" >> kprobe_events # echo 1 > events/kprobes/enable # ln -s /tmp/foo /tmp/bar (without this patch) # cat trace ln-897 [000] 20404.133727: event1: (kretprobe_trampoline+0x0/0x4c <- sys_symlink) ln-897 [000] 20404.133747: event2: (system_call_fastpath+0x16/0x1b <- sys_symlink) (with this patch) # cat trace ln-740 [000] 13799.491076: event1: (system_call_fastpath+0x16/0x1b <- sys_symlink) ln-740 [000] 13799.491096: event2: (system_call_fastpath+0x16/0x1b <- sys_symlink) Signed-off-by: KUMANO Syuhei Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Peter Zijlstra Cc: YOSHIFUJI Hideaki LKML-Reference: <1281853084.3254.11.camel@camp10-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1bfb6cf4dd55..770ebfb349e9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -709,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + kprobe_opcode_t *correct_ret_addr = NULL; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); @@ -740,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) /* another task is sharing our hash bucket */ continue; + orig_ret_address = (unsigned long)ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __get_cpu_var(current_kprobe) = &ri->rp->kp; get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); __get_cpu_var(current_kprobe) = NULL; } - orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) @@ -759,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) break; } - kretprobe_assert(ri, orig_ret_address, trampoline_address); - kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { -- cgit v1.2.1 From d7c53c9e822a4fefa13a0cae76f3190bfd0d5c11 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Aug 2010 20:10:29 +0200 Subject: x86, hotplug: Serialize CPU hotplug to avoid bringup concurrency issues When testing cpu hotplug code on 32-bit we kept hitting the "CPU%d: Stuck ??" message due to multiple cores concurrently accessing the cpu_callin_mask, among others. Since these codepaths are not protected from concurrent access due to the fact that there's no sane reason for making an already complex code unnecessarily more complex - we hit the issue only when insanely switching cores off- and online - serialize hotplugging cores on the sysfs level and be done with it. [ v2.1: fix !HOTPLUG_CPU build ] Cc: Signed-off-by: Borislav Petkov LKML-Reference: <20100819181029.GC17171@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index abf4a86ffc54..8b3bfc4dd708 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -90,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) + +/* + * We need this for trampoline_base protection from concurrent accesses when + * off- and onlining cores wildly. + */ +static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); + +void cpu_hotplug_driver_lock() +{ + mutex_lock(&x86_cpu_hotplug_driver_mutex); +} + +void cpu_hotplug_driver_unlock() +{ + mutex_unlock(&x86_cpu_hotplug_driver_mutex); +} + +ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } +ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) -- cgit v1.2.1 From 05e407603e527f9d808dd3866d3a17c2ce4dfcc5 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Fri, 20 Aug 2010 00:46:16 +0200 Subject: x86, apic: Fix apic=debug boot crash Fix a boot crash when apic=debug is used and the APIC is not properly initialized. This issue appears during Xen Dom0 kernel boot but the fix is generic and the crash could occur on real hardware as well. Signed-off-by: Daniel Kiper Cc: xen-devel@lists.xensource.com Cc: konrad.wilk@oracle.com Cc: jeremy@goop.org Cc: # .35.x, .34.x, .33.x, .32.x LKML-Reference: <20100819224616.GB9967@router-fw-old.local.net-space.pl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4dc0084ec1b1..f1efebaf5510 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1728,6 +1728,8 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_pin_list *entry; cfg = desc->chip_data; + if (!cfg) + continue; entry = cfg->irq_2_pin; if (!entry) continue; -- cgit v1.2.1 From cd7240c0b900eb6d690ccee088a6c9b46dae815a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 19 Aug 2010 17:03:38 -0700 Subject: x86, tsc, sched: Recompute cyc2ns_offset's during resume from sleep states TSC's get reset after suspend/resume (even on cpu's with invariant TSC which runs at a constant rate across ACPI P-, C- and T-states). And in some systems BIOS seem to reinit TSC to arbitrary large value (still sync'd across cpu's) during resume. This leads to a scenario of scheduler rq->clock (sched_clock_cpu()) less than rq->age_stamp (introduced in 2.6.32). This leads to a big value returned by scale_rt_power() and the resulting big group power set by the update_group_power() is causing improper load balancing between busy and idle cpu's after suspend/resume. This resulted in multi-threaded workloads (like kernel-compilation) go slower after suspend/resume cycle on core i5 laptops. Fix this by recomputing cyc2ns_offset's during resume, so that sched_clock() continues from the point where it was left off during suspend. Reported-by: Florian Pritz Signed-off-by: Suresh Siddha Cc: # [v2.6.32+] Signed-off-by: Peter Zijlstra LKML-Reference: <1282262618.2675.24.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ce8e50239332..d632934cb638 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } +static unsigned long long cyc2ns_suspend; + +void save_sched_clock_state(void) +{ + if (!sched_clock_stable) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. + */ +void restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable) + return; + + local_irq_save(flags); + + get_cpu_var(cyc2ns_offset) = 0; + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) + per_cpu(cyc2ns_offset, cpu) = offset; + + local_irq_restore(flags); +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency -- cgit v1.2.1 From 8d330919927ea31fa083b5a80084dc991da813a0 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Wed, 25 Aug 2010 21:06:32 +0000 Subject: perf, x86, Pentium4: Clear the P4_CCCR_FORCE_OVF flag If on Pentium4 CPUs the FORCE_OVF flag is set then an NMI happens on every event, which can generate a flood of NMIs. Clear it. Reported-by: Vince Weaver Signed-off-by: Lin Ming Signed-off-by: Cyrill Gorcunov Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index febb12cea795..7e578e9cc58b 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -497,6 +497,8 @@ static int p4_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); + + event->hw.config &= ~P4_CCCR_FORCE_OVF; } rc = x86_setup_perfctr(event); -- cgit v1.2.1