From 6ec3cfeca04622e3d80c9270191cd7f5f88214af Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Mon, 13 Apr 2009 15:20:58 -0700 Subject: x86, irq: Remove IRQ_DISABLED check in process context IRQ move As discussed in the thread here: http://marc.info/?l=linux-kernel&m=123964468521142&w=2 Eric W. Biederman observed: > It looks like some additional bugs have slipped in since last I looked. > > set_irq_affinity does this: > ifdef CONFIG_GENERIC_PENDING_IRQ > if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { > cpumask_copy(desc->affinity, cpumask); > desc->chip->set_affinity(irq, cpumask); > } else { > desc->status |= IRQ_MOVE_PENDING; > cpumask_copy(desc->pending_mask, cpumask); > } > #else > > That IRQ_DISABLED case is a software state and as such it has nothing to > do with how safe it is to move an irq in process context. [...] > > The only reason we migrate MSIs in interrupt context today is that there > wasn't infrastructure for support migration both in interrupt context > and outside of it. Yes. The idea here was to force the MSI migration to happen in process context. One of the patches in the series did disable_irq(dev->irq); irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); enable_irq(dev->irq); with the above patch adding irq/manage code check for interrupt disabled and moving the interrupt in process context. IIRC, there was no IRQ_MOVE_PCNTXT when we were developing this HPET code and we ended up having this ugly hack. IRQ_MOVE_PCNTXT was there when we eventually submitted the patch upstream. But, looks like I did a blind rebasing instead of using IRQ_MOVE_PCNTXT in hpet MSI code. Below patch fixes this. i.e., revert commit 932775a4ab622e3c99bd59f14cc and add PCNTXT to HPET MSI setup. Also removes copying of desc->affinity in generic code as set_affinity routines are doing it internally. Reported-by: "Eric W. Biederman" Signed-off-by: Venkatesh Pallipadi Acked-by: "Eric W. Biederman" Cc: "Li Shaohua" Cc: Gary Hade Cc: "lcm@us.ibm.com" Cc: suresh.b.siddha@intel.com LKML-Reference: <20090413222058.GB8211@linux-os.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7e2e7dd4cd2f..2734eca59243 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -109,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - cpumask_copy(desc->affinity, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) desc->chip->set_affinity(irq, cpumask); - } else { + else { desc->status |= IRQ_MOVE_PENDING; cpumask_copy(desc->pending_mask, cpumask); } -- cgit v1.2.1 From c8a250058656495be02c00de61e26b017c86ef00 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Apr 2009 09:40:49 +0200 Subject: lockdep: more robust lockdep_map init sequence Steven Rostedt reported: > OK, I think I figured this bug out. This is a lockdep issue with respect > to tracepoints. > > The trace points in lockdep are called all the time. Outside the lockdep > logic. But if lockdep were to trigger an error / warning (which this run > did) we might be in trouble. For new locks, like the dentry->d_lock, that > are created, they will not get a name: > > void lockdep_init_map(struct lockdep_map *lock, const char *name, > struct lock_class_key *key, int subclass) > { > if (unlikely(!debug_locks)) > return; > > When a problem is found by lockdep, debug_locks becomes false. Thus we > stop allocating names for locks. This dentry->d_lock I had, now has no > name. Worse yet, I have CONFIG_DEBUG_VM set, that scrambles non > initialized memory. Thus, when the trace point was hit, it had junk for > the lock->name, and the machine crashed. Ah, nice catch. I think we should put at least the name in regardless. Ensure we at least initialize the trivial entries of the depmap so that they can be relied upon, even when lockdep itself decided to pack up and go home. [ Impact: fix lock tracing after lockdep warnings. ] Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Andrew Morton Cc: Frederic Weisbecker LKML-Reference: <1239954049.23397.4156.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b0f011866969..accb40cdb12a 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2490,13 +2490,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - if (unlikely(!debug_locks)) + lock->class_cache = NULL; +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif + + if (DEBUG_LOCKS_WARN_ON(!name)) { + lock->name = "NULL"; return; + } + + lock->name = name; if (DEBUG_LOCKS_WARN_ON(!key)) return; - if (DEBUG_LOCKS_WARN_ON(!name)) - return; /* * Sanity check, the lock-class key must be persistent: */ @@ -2505,12 +2512,11 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, DEBUG_LOCKS_WARN_ON(1); return; } - lock->name = name; lock->key = key; - lock->class_cache = NULL; -#ifdef CONFIG_LOCK_STAT - lock->cpu = raw_smp_processor_id(); -#endif + + if (unlikely(!debug_locks)) + return; + if (subclass) register_lock_class(lock, subclass, 1); } -- cgit v1.2.1 From b48ccb095a0c9257241261ec2bd1cbb1bdabc48b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Apr 2009 09:36:52 +0200 Subject: locking: clarify kernel-taint warning message Andi Kleen reported this message triggering on non-lockdep kernels: Disabling lockdep due to kernel taint Clarify the message to say 'lock debugging' - debug_locks_off() turns off all things lock debugging, not just lockdep. [ Impact: change kernel warning message text ] Reported-by: Andi Kleen Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 934fb377f4b3..3dcaa1661357 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -221,7 +221,7 @@ void add_taint(unsigned flag) * post-warning case. */ if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) - printk(KERN_WARNING "Disabling lockdep due to kernel taint\n"); + printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } -- cgit v1.2.1 From cad81bc2529ab8c62b6fdc83a1c0c7f4a87209eb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 27 Apr 2009 01:41:34 +0200 Subject: ptrace: ptrace_attach: fix the usage of ->cred_exec_mutex ptrace_attach() needs task->cred_exec_mutex, not current->cred_exec_mutex. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: David Howells Signed-off-by: James Morris --- kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dfcd83ceee3b..0692ab5a0d67 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -188,7 +188,7 @@ int ptrace_attach(struct task_struct *task) /* Protect exec's credential calculations against our interference; * SUID, SGID and LSM creds get determined differently under ptrace. */ - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(&task->cred_exec_mutex); if (retval < 0) goto out; @@ -232,7 +232,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(&task->cred_exec_mutex); out: return retval; } -- cgit v1.2.1 From 7267fa6819467669f5cc2ba81a615dcc88158b4b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:16:21 -0400 Subject: tracing: fix ref count in splice pages The pages allocated for the splice binary buffer did not initialize the ref count correctly. This caused pages not to be freed and causes a drastic memory leak. Thanks to logdev I was able to trace the tracer to find where the leak was. [ Impact: stop memory leak when using splice ] Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1ce5dc6372b8..a884c09006c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3448,6 +3448,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { -- cgit v1.2.1 From f5f293a4e3d0a0c52cec31de6762c95050156516 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Apr 2009 14:44:49 +0200 Subject: sched: account system time properly Andrew Gallatin reported that IRQ and SOFTIRQ times were sometime not reported correctly on recent kernels, and even bisected to commit 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 ([PATCH] fix scaled & unscaled cputime accounting) as the first bad commit. Further analysis pointed that commit 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d ([PATCH] idle cputime accounting) was the real cause of the problem. account_process_tick() was not taking into account timer IRQ interrupting the idle task servicing a hard or soft irq. On mostly idle cpu, irqs were thus not accounted and top or mpstat could tell user/admin that cpu was 100 % idle, 0.00 % irq, 0.00 % softirq, while it was not. [ Impact: fix occasionally incorrect CPU statistics in top/mpstat ] Reported-by: Andrew Gallatin Re-reported-by: Andrew Morton Signed-off-by: Eric Dumazet Acked-by: Martin Schwidefsky Cc: rick.jones2@hp.com Cc: brice@myri.com Cc: Paul Mackerras Cc: Benjamin Herrenschmidt LKML-Reference: <49F84BC1.7080602@cosmosbay.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a0..26efa475bdc1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4732,7 +4732,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (user_tick) account_user_time(p, one_jiffy, one_jiffy_scaled); - else if (p != rq->idle) + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) account_system_time(p, HARDIRQ_OFFSET, one_jiffy, one_jiffy_scaled); else -- cgit v1.2.1 From 6e85c5ba73c07b990798087e9b858c065db2b234 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Wed, 29 Apr 2009 19:14:32 -0400 Subject: kernel/posix-cpu-timers.c: fix sparse warning Sparse reports the following in kernel/posix-cpu-timers.c: warning: symbol 'firing' shadows an earlier one Signed-off-by: H Hartley Sweeten Cc: Subrata Modak LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c9dcf98b4463..bece7c0b67b2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1420,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk) * timer call will interfere. */ list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { - int firing; + int cpu_firing; + spin_lock(&timer->it_lock); list_del_init(&timer->it.cpu.entry); - firing = timer->it.cpu.firing; + cpu_firing = timer->it.cpu.firing; timer->it.cpu.firing = 0; /* * The firing flag is -1 if we collided with a reset * of the timer, which already reported this * almost-firing as an overrun. So don't generate an event. */ - if (likely(firing >= 0)) { + if (likely(cpu_firing >= 0)) cpu_timer_fire(timer); - } spin_unlock(&timer->it_lock); } } -- cgit v1.2.1 From d7226fb6ec5d4f325e4e7fd905894e2ea3eb3ae0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 15:16:04 +0200 Subject: Revert "genirq: assert that irq handlers are indeed running in hardirq context" This reverts commit 044d408409cc4e1bc75c886e27ca85c270db104c. The commit added a warning when handle_IRQ_event() is called outside of hard interrupt context. This breaks the generic tasklet based interrupt resend mechanism which is used when the hardware has no way to retrigger the interrupt. So we get a warning for a use case which is correct and worked for years. Remove it. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd2..26e08754744f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -363,8 +363,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!"); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); -- cgit v1.2.1 From 74a03b69d1b5ce00a568e142ca97e76b7f5239c6 Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 1 May 2009 13:10:25 -0700 Subject: clockevents: prevent endless loop in tick_handle_periodic() tick_handle_periodic() can lock up hard when a one shot clock event device is used in combination with jiffies clocksource. Avoid an endless loop issue by requiring that a highres valid clocksource be installed before we call tick_periodic() in a loop when using ONESHOT mode. The result is we will only increment jiffies once per interrupt until a continuous hardware clocksource is available. Without this, we can run into a endless loop, where each cycle through the loop, jiffies is updated which increments time by tick_period or more (due to clock steering), which can cause the event programming to think the next event was before the newly incremented time and fail causing tick_periodic() to be called again and the whole process loops forever. [ Impact: prevent hard lock up ] Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/time/tick-common.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 21a5ca849514..83c4417b6a3c 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev) for (;;) { if (!clockevents_program_event(dev, next, ktime_get())) return; - tick_periodic(cpu); + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. + * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * when then will increment time, posibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); next = ktime_add(next, tick_period); } } -- cgit v1.2.1 From 9e4a5bda89034502fb144331e71a0efdfd5fae97 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 30 Apr 2009 15:08:57 -0700 Subject: mm: prevent divide error for small values of vm_dirty_bytes Avoid setting less than two pages for vm_dirty_bytes: this is necessary to avoid potential division by 0 (like the following) in get_dirty_limits(). [ 49.951610] divide error: 0000 [#1] PREEMPT SMP [ 49.952195] last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/host0/target0:0:0/0:0:0:0/block/sda/uevent [ 49.952195] CPU 1 [ 49.952195] Modules linked in: pcspkr [ 49.952195] Pid: 3064, comm: dd Not tainted 2.6.30-rc3 #1 [ 49.952195] RIP: 0010:[] [] get_dirty_limits+0xe9/0x2c0 [ 49.952195] RSP: 0018:ffff88001de03a98 EFLAGS: 00010202 [ 49.952195] RAX: 00000000000000c0 RBX: ffff88001de03b80 RCX: 28f5c28f5c28f5c3 [ 49.952195] RDX: 0000000000000000 RSI: 00000000000000c0 RDI: 0000000000000000 [ 49.952195] RBP: ffff88001de03ae8 R08: 0000000000000000 R09: 0000000000000000 [ 49.952195] R10: ffff88001ddda9a0 R11: 0000000000000001 R12: 0000000000000001 [ 49.952195] R13: ffff88001fbc8218 R14: ffff88001de03b70 R15: ffff88001de03b78 [ 49.952195] FS: 00007fe9a435b6f0(0000) GS:ffff8800025d9000(0000) knlGS:0000000000000000 [ 49.952195] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 49.952195] CR2: 00007fe9a39ab000 CR3: 000000001de38000 CR4: 00000000000006e0 [ 49.952195] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 49.952195] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 49.952195] Process dd (pid: 3064, threadinfo ffff88001de02000, task ffff88001ddda250) [ 49.952195] Stack: [ 49.952195] ffff88001fa0de00 ffff88001f2dbd70 ffff88001f9fe800 000080b900000000 [ 49.952195] 00000000000000c0 ffff8800027a6100 0000000000000400 ffff88001fbc8218 [ 49.952195] 0000000000000000 0000000000000600 ffff88001de03bb8 ffffffff802d3ed7 [ 49.952195] Call Trace: [ 49.952195] [] balance_dirty_pages_ratelimited_nr+0x1d7/0x3f0 [ 49.952195] [] ? ext3_writeback_write_end+0x9e/0x120 [ 49.952195] [] generic_file_buffered_write+0x12f/0x330 [ 49.952195] [] __generic_file_aio_write_nolock+0x26d/0x460 [ 49.952195] [] ? generic_file_aio_write+0x52/0xd0 [ 49.952195] [] generic_file_aio_write+0x69/0xd0 [ 49.952195] [] ext3_file_write+0x26/0xc0 [ 49.952195] [] do_sync_write+0xf1/0x140 [ 49.952195] [] ? get_lock_stats+0x2a/0x60 [ 49.952195] [] ? autoremove_wake_function+0x0/0x40 [ 49.952195] [] vfs_write+0xcb/0x190 [ 49.952195] [] sys_write+0x50/0x90 [ 49.952195] [] system_call_fastpath+0x16/0x1b [ 49.952195] Code: 00 00 00 2b 05 09 1c 17 01 48 89 c6 49 0f af f4 48 c1 ee 02 48 89 f0 48 f7 e1 48 89 d6 31 d2 48 c1 ee 02 48 0f af 75 d0 48 89 f0 <48> f7 f7 41 8b 95 ac 01 00 00 48 89 c7 49 0f af d4 48 c1 ea 02 [ 49.952195] RIP [] get_dirty_limits+0xe9/0x2c0 [ 49.952195] RSP [ 50.096523] ---[ end trace 008d7aa02f244d7b ]--- Signed-off-by: Andrea Righi Cc: Peter Zijlstra Cc: David Rientjes Cc: Dave Chinner Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b9..ea78fa101ad6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -103,6 +103,9 @@ static unsigned long one_ul = 1; static int one_hundred = 100; static int one_thousand = 1000; +/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; @@ -1006,7 +1009,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one_ul, + .extra1 = &dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", -- cgit v1.2.1 From 381a80e6df396eaabef2c00f85974a4579ac1c70 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 6 May 2009 16:02:50 -0700 Subject: inotify: use GFP_NOFS in kernel_event() to work around a lockdep false-positive There is what we believe to be a false positive reported by lockdep. inotify_inode_queue_event() => take inotify_mutex => kernel_event() => kmalloc() => SLOB => alloc_pages_node() => page reclaim => slab reclaim => dcache reclaim => inotify_inode_is_dead => take inotify_mutex => deadlock The plan is to fix this via lockdep annotation, but that is proving to be quite involved. The patch flips the allocation over to GFP_NFS to shut the warning up, for the 2.6.30 release. Hopefully we will fix this for real in 2.6.31. I'll queue a patch in -mm to switch it back to GFP_KERNEL so we don't forget. ================================= [ INFO: inconsistent lock state ] 2.6.30-rc2-next-20090417 #203 --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/380 [HC0[0]:SC0[0]:HE1:SE1] takes: (&inode->inotify_mutex){+.+.?.}, at: [] inotify_inode_is_dead+0x35/0xb0 {RECLAIM_FS-ON-W} state was registered at: [] mark_held_locks+0x68/0x90 [] lockdep_trace_alloc+0xf5/0x100 [] __kmalloc_node+0x31/0x1e0 [] kernel_event+0xe2/0x190 [] inotify_dev_queue_event+0x126/0x230 [] inotify_inode_queue_event+0xc6/0x110 [] vfs_create+0xcd/0x140 [] do_filp_open+0x88d/0xa20 [] do_sys_open+0x98/0x140 [] sys_open+0x20/0x30 [] system_call_fastpath+0x16/0x1b [] 0xffffffffffffffff irq event stamp: 690455 hardirqs last enabled at (690455): [] _spin_unlock_irqrestore+0x44/0x80 hardirqs last disabled at (690454): [] _spin_lock_irqsave+0x32/0xa0 softirqs last enabled at (690178): [] __do_softirq+0x202/0x220 softirqs last disabled at (690157): [] call_softirq+0x1c/0x50 other info that might help us debug this: 2 locks held by kswapd0/380: #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x37/0x180 #1: (&type->s_umount_key#17){++++..}, at: [] shrink_dcache_memory+0x11f/0x1e0 stack backtrace: Pid: 380, comm: kswapd0 Not tainted 2.6.30-rc2-next-20090417 #203 Call Trace: [] print_usage_bug+0x19f/0x200 [] ? save_stack_trace+0x2f/0x50 [] mark_lock+0x4bb/0x6d0 [] ? check_usage_forwards+0x0/0xc0 [] __lock_acquire+0xc62/0x1ae0 [] ? slob_free+0x10c/0x370 [] lock_acquire+0xe1/0x120 [] ? inotify_inode_is_dead+0x35/0xb0 [] mutex_lock_nested+0x63/0x420 [] ? inotify_inode_is_dead+0x35/0xb0 [] ? inotify_inode_is_dead+0x35/0xb0 [] ? sched_clock+0x9/0x10 [] ? lock_release_holdtime+0x35/0x1c0 [] inotify_inode_is_dead+0x35/0xb0 [] dentry_iput+0xbc/0xe0 [] d_kill+0x33/0x60 [] __shrink_dcache_sb+0x2d3/0x350 [] shrink_dcache_memory+0x15a/0x1e0 [] shrink_slab+0x125/0x180 [] kswapd+0x560/0x7a0 [] ? isolate_pages_global+0x0/0x2c0 [] ? autoremove_wake_function+0x0/0x40 [] ? trace_hardirqs_on+0xd/0x10 [] ? kswapd+0x0/0x7a0 [] kthread+0x5b/0xa0 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0xa0 [] ? child_rip+0x0/0x20 [eparis@redhat.com: fix audit too] Cc: Al Viro Cc: Matt Mackall Cc: Christoph Lameter Signed-off-by: Wu Fengguang Signed-off-by: Eric Paris Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a6fe71fd5d1b..713098ee5a02 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1028,7 +1028,7 @@ static void audit_update_watch(struct audit_parent *parent, if (audit_enabled) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u", audit_get_loginuid(current), @@ -1067,7 +1067,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent) e = container_of(r, struct audit_entry, rule); if (audit_enabled) { struct audit_buffer *ab; - ab = audit_log_start(NULL, GFP_KERNEL, + ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); audit_log_format(ab, "auid=%u ses=%u", audit_get_loginuid(current), -- cgit v1.2.1 From 57adc4d2dbf968fdbe516359688094eef4d46581 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 6 May 2009 16:02:53 -0700 Subject: Eliminate thousands of warnings with gcc 3.2 build When building with gcc 3.2 I get thousands of warnings such as include/linux/gfp.h: In function `allocflags_to_migratetype': include/linux/gfp.h:105: warning: null format string due to passing a NULL format string to warn_slowpath() in #define __WARN() warn_slowpath(__FILE__, __LINE__, NULL) Split this case out into a separate call. This also shrinks the kernel slightly: text data bss dec hex filename 4802274 707668 712704 6222646 5ef336 vmlinux text data bss dec hex filename 4799027 703572 712704 6215303 5ed687 vmlinux due to removeing one argument from the commonly-called __WARN(). [akpm@linux-foundation.org: reduce scope of `empty'] Acked-by: Jesper Nilsson Acked-by: Johannes Weiner Acked-by: Arjan van de Ven Signed-off-by: Andi Kleen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 3dcaa1661357..874ecf1307ae 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -340,7 +340,7 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath(const char *file, int line, const char *fmt, ...) +void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { va_list args; char function[KSYM_SYMBOL_LEN]; @@ -356,7 +356,7 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) if (board) printk(KERN_WARNING "Hardware name: %s\n", board); - if (fmt) { + if (*fmt) { va_start(args, fmt); vprintk(fmt, args); va_end(args); @@ -367,7 +367,14 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) print_oops_end_marker(); add_taint(TAINT_WARN); } -EXPORT_SYMBOL(warn_slowpath); +EXPORT_SYMBOL(warn_slowpath_fmt); + +void warn_slowpath_null(const char *file, int line) +{ + static const char *empty = ""; + warn_slowpath_fmt(file, line, empty); +} +EXPORT_SYMBOL(warn_slowpath_null); #endif #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.1 From 201517a7f3ec497fff545a7659c6c876f89f9054 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 7 May 2009 16:31:26 -0400 Subject: kprobes: fix to use text_mutex around arm/disarm kprobe Fix kprobes to lock text_mutex around some arch_arm/disarm_kprobe() which are newly added by commit de5bd88d5a5cce3cacea904d3503e5ebdb3852a2. Signed-off-by: Masami Hiramatsu Acked-by: Ananth N Mavinakayanahalli Cc: Mathieu Desnoyers Cc: Jim Keniston Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a5e74ddee0e2..c0fa54b276d9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -319,6 +319,22 @@ struct kprobe __kprobes *get_kprobe(void *addr) return NULL; } +/* Arm a kprobe with text_mutex */ +static void __kprobes arm_kprobe(struct kprobe *kp) +{ + mutex_lock(&text_mutex); + arch_arm_kprobe(kp); + mutex_unlock(&text_mutex); +} + +/* Disarm a kprobe with text_mutex */ +static void __kprobes disarm_kprobe(struct kprobe *kp) +{ + mutex_lock(&text_mutex); + arch_disarm_kprobe(kp); + mutex_unlock(&text_mutex); +} + /* * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list @@ -538,7 +554,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. */ - arch_arm_kprobe(ap); + arm_kprobe(ap); } return 0; } @@ -789,11 +805,8 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. */ - if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) { - mutex_lock(&text_mutex); - arch_disarm_kprobe(p); - mutex_unlock(&text_mutex); - } + if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) + disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -810,7 +823,7 @@ noclean: if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - arch_disarm_kprobe(old_p); + disarm_kprobe(old_p); } } return 0; @@ -1364,7 +1377,7 @@ int __kprobes disable_kprobe(struct kprobe *kp) try_to_disable_aggr_kprobe(p); if (!kprobes_all_disarmed && kprobe_disabled(p)) - arch_disarm_kprobe(p); + disarm_kprobe(p); out: mutex_unlock(&kprobe_mutex); return ret; @@ -1393,7 +1406,7 @@ int __kprobes enable_kprobe(struct kprobe *kp) } if (!kprobes_all_disarmed && kprobe_disabled(p)) - arch_arm_kprobe(p); + arm_kprobe(p); p->flags &= ~KPROBE_FLAG_DISABLED; if (p != kp) -- cgit v1.2.1 From 92d23f703c608fcb2c8edd74a3fd0f4031e18606 Mon Sep 17 00:00:00 2001 From: Ron Date: Fri, 8 May 2009 22:54:49 +0930 Subject: sched: Fix fallback sched_clock()'s offset when using jiffies Account for the initial offset to the jiffy count. [ Impact: fix printk timestamps on architectures using fallback sched_clock() ] Signed-off-by: Ron Lee Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 819f17ac796e..e1d16c9a7680 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -38,7 +38,8 @@ */ unsigned long long __attribute__((weak)) sched_clock(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); } static __read_mostly int sched_clock_running; -- cgit v1.2.1 From 6f5bbff9a1b7d6864a495763448a363bbfa96324 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 6 May 2009 01:34:22 -0400 Subject: Convert obvious places to deactivate_locked_super() Signed-off-by: Al Viro --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 382109b5baeb..a7267bfd3765 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1133,8 +1133,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links: free_cg_links(&tmp_cg_links); drop_new_super: - up_write(&sb->s_umount); - deactivate_super(sb); + deactivate_locked_super(sb); return ret; } -- cgit v1.2.1 From d80c19df5fcceb8c741e96f09f275c2da719efef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 12 May 2009 16:29:13 +0200 Subject: lockdep: increase MAX_LOCKDEP_ENTRIES and MAX_LOCKDEP_CHAINS Now that lockdep coverage has increased it has become easier to run out of entries: [ 21.401387] BUG: MAX_LOCKDEP_ENTRIES too low! [ 21.402007] turning off the locking correctness validator. [ 21.402007] Pid: 1555, comm: S99local Not tainted 2.6.30-rc5-tip #2 [ 21.402007] Call Trace: [ 21.402007] [] add_lock_to_list+0x53/0xba [ 21.402007] [] ? lookup_mnt+0x19/0x53 [ 21.402007] [] check_prev_add+0x14b/0x1c7 [ 21.402007] [] validate_chain+0x474/0x52a [ 21.402007] [] __lock_acquire+0x342/0x3c7 [ 21.402007] [] lock_acquire+0xc1/0xe5 [ 21.402007] [] ? lookup_mnt+0x19/0x53 [ 21.402007] [] _spin_lock+0x31/0x66 Double the size - as we've done in the past. [ Impact: allow lockdep to cover more locks ] Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/lockdep_internals.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index a2cc7e9a6e84..699a2ac3a0d7 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -54,9 +54,9 @@ enum { * table (if it's not there yet), and we check it for lock order * conflicts and deadlocks. */ -#define MAX_LOCKDEP_ENTRIES 8192UL +#define MAX_LOCKDEP_ENTRIES 16384UL -#define MAX_LOCKDEP_CHAINS_BITS 14 +#define MAX_LOCKDEP_CHAINS_BITS 15 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) #define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -- cgit v1.2.1 From cd17cbfda004fe5f406c01b318c6378d9895896f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 May 2009 11:32:24 +0200 Subject: Revert "mm: add /proc controls for pdflush threads" This reverts commit fafd688e4c0c34da0f3de909881117d374e4c7af. Work is progressing to switch away from pdflush as the process backing for flushing out dirty data. So it seems pointless to add more knobs to control pdflush threads. The original author of the patch did not have any specific use cases for adding the knobs, so we can easily revert this before 2.6.30 to avoid having to maintain this API forever. Signed-off-by: Jens Axboe --- kernel/sysctl.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea78fa101ad6..b2970d56fb76 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,7 +101,6 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -static int one_thousand = 1000; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -1033,28 +1032,6 @@ static struct ctl_table vm_table[] = { .mode = 0444 /* read-only*/, .proc_handler = &proc_dointvec, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_min", - .data = &nr_pdflush_threads_min, - .maxlen = sizeof nr_pdflush_threads_min, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &one, - .extra2 = &nr_pdflush_threads_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_max", - .data = &nr_pdflush_threads_max, - .maxlen = sizeof nr_pdflush_threads_max, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &nr_pdflush_threads_min, - .extra2 = &one_thousand, - }, { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", -- cgit v1.2.1 From 364b5b7b1d793a7f98be55b6b154716dcae78dfc Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 13 May 2009 21:56:59 -0500 Subject: sysrq, intel_fb: fix sysrq g collision Commit 79e539453b34e35f39299a899d263b0a1f1670bd introduced a regression where you cannot use sysrq 'g' to enter kgdb. The solution is to move the intel fb sysrq over to V for video instead of G for graphics. The SMP VOYAGER code to register for the sysrq-v is not anywhere to be found in the mainline kernel, so the comments in the code were cleaned up as well. This patch also cleans up the sysrq definitions for kgdb to make it generic for the kernel debugger, such that the sysrq 'g' can be used in the future to enter a gdbstub or another kernel debugger. Signed-off-by: Jason Wessel Acked-by: Jesse Barnes Acked-by: Randy Dunlap Signed-off-by: Andrew Morton --- kernel/kgdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index e4dcfb2272a4..9147a3190c9d 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty) static struct sysrq_key_op sysrq_gdb_op = { .handler = sysrq_handle_gdb, - .help_msg = "Gdb", - .action_msg = "GDB", + .help_msg = "debug(G)", + .action_msg = "DEBUG", }; #endif -- cgit v1.2.1 From 88fc86c283d9c3854e67e4155808027bc2519eb6 Mon Sep 17 00:00:00 2001 From: GeunSik Lim Date: Thu, 14 May 2009 17:23:38 +0900 Subject: tracing: Append prompt in /debug/tracing/README file append prompt in /debug/tracing/README file. This is trivial issue. Fix typo Mini Howto file(README) for ftrace. [ Impact: cleanup ] Signed-off-by: GeunSik Lim Acked-by: Steven Rostedt Cc: williams LKML-Reference: <1242289418.31161.45.camel@centos51> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a884c09006c4..cda81ec58d9f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2380,7 +2380,7 @@ static const char readme_msg[] = "# echo print-parent > /debug/tracing/trace_options\n" "# echo 1 > /debug/tracing/tracing_enabled\n" "# cat /debug/tracing/trace > /tmp/trace.txt\n" - "echo 0 > /debug/tracing/tracing_enabled\n" + "# echo 0 > /debug/tracing/tracing_enabled\n" ; static ssize_t -- cgit v1.2.1 From 4484079d517c2b6521621be0b1ea246ccc55c7d7 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 15 May 2009 23:30:50 +0200 Subject: PM: check sysdev_suspend(PMSG_FREEZE) return value Check the return value of sysdev_suspend(). I think this was a typo. Without this change, the following "if" check is always false. I also changed the error message so it's distinguishable from the similar message a few lines above. Signed-off-by: Bjorn Helgaas Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index e71ca9cd81b2..b0dc9e7a0d17 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -241,9 +241,9 @@ static int create_image(int platform_mode) local_irq_disable(); - sysdev_suspend(PMSG_FREEZE); + error = sysdev_suspend(PMSG_FREEZE); if (error) { - printk(KERN_ERR "PM: Some devices failed to power down, " + printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); goto Enable_irqs; } -- cgit v1.2.1 From 0f6f49a8cd0163fdb1723ed29f01fc65177108dc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 May 2009 13:41:28 -0700 Subject: Fix caller information for warn_slowpath_null Ian Campbell noticed that since "Eliminate thousands of warnings with gcc 3.2 build" (commit 57adc4d2dbf968fdbe516359688094eef4d46581) all WARN_ON()'s currently appear to come from warn_slowpath_null(), eg: WARNING: at kernel/softirq.c:143 warn_slowpath_null+0x1c/0x20() because now that warn_slowpath_null() is in the call path, the __builtin_return_address(0) returns that, rather than the place that caused the warning. Fix this by splitting up the warn_slowpath_null/fmt cases differently, using a common helper function, and getting the return address in the right place. This also happens to avoid the unnecessary stack usage for the non-stdargs case, and just generally cleans things up. Make the function name printout use %pS while at it. Cc: Ian Campbell Cc: Jesper Nilsson Cc: Johannes Weiner Cc: Arjan van de Ven Cc: Andi Kleen Cc: Hugh Dickins Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 874ecf1307ae..984b3ecbd72c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -340,39 +340,44 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) -{ +struct slowpath_args { + const char *fmt; va_list args; - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long)__builtin_return_address(0); - const char *board; +}; - sprint_symbol(function, caller); +static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) +{ + const char *board; printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); + printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); board = dmi_get_system_info(DMI_PRODUCT_NAME); if (board) printk(KERN_WARNING "Hardware name: %s\n", board); - if (*fmt) { - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); - } + if (args) + vprintk(args->fmt, args->args); print_modules(); dump_stack(); print_oops_end_marker(); add_taint(TAINT_WARN); } + +void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), &args); + va_end(args.args); +} EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_null(const char *file, int line) { - static const char *empty = ""; - warn_slowpath_fmt(file, line, empty); + warn_slowpath_common(file, line, __builtin_return_address(0), NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif -- cgit v1.2.1