summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt15
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c23
-rw-r--r--kernel/audit.c48
-rw-r--r--kernel/audit_tree.c28
-rw-r--r--kernel/auditfilter.c15
-rw-r--r--kernel/auditsc.c35
-rw-r--r--kernel/cgroup.c211
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/exit.c114
-rw-r--r--kernel/fork.c20
-rw-r--r--kernel/futex.c58
-rw-r--r--kernel/futex_compat.c13
-rw-r--r--kernel/hrtimer.c48
-rw-r--r--kernel/irq/chip.c20
-rw-r--r--kernel/irq/spurious.c3
-rw-r--r--kernel/kgdb.c1700
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/kprobes.c52
-rw-r--r--kernel/lockdep.c8
-rw-r--r--kernel/marker.c704
-rw-r--r--kernel/module.c38
-rw-r--r--kernel/posix-timers.c8
-rw-r--r--kernel/power/Kconfig2
-rw-r--r--kernel/power/disk.c4
-rw-r--r--kernel/power/process.c29
-rw-r--r--kernel/power/snapshot.c83
-rw-r--r--kernel/printk.c98
-rw-r--r--kernel/rcupdate.c5
-rw-r--r--kernel/rcupreempt.c233
-rw-r--r--kernel/relay.c12
-rw-r--r--kernel/res_counter.c1
-rw-r--r--kernel/rtmutex.c5
-rw-r--r--kernel/sched.c894
-rw-r--r--kernel/sched_debug.c1
-rw-r--r--kernel/sched_fair.c427
-rw-r--r--kernel/sched_rt.c112
-rw-r--r--kernel/semaphore.c264
-rw-r--r--kernel/signal.c89
-rw-r--r--kernel/softirq.c1
-rw-r--r--kernel/softlockup.c13
-rw-r--r--kernel/sysctl.c32
-rw-r--r--kernel/time/clocksource.c28
-rw-r--r--kernel/time/ntp.c23
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-common.c4
-rw-r--r--kernel/time/tick-oneshot.c2
-rw-r--r--kernel/time/tick-sched.c5
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/timeconst.pl2
-rw-r--r--kernel/timer.c10
-rw-r--r--kernel/uid16.c22
-rw-r--r--kernel/user.c50
54 files changed, 4316 insertions, 1324 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
endchoice
+config PREEMPT_RCU
+ bool "Preemptible RCU"
+ depends on PREEMPT
+ default n
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible, if
+ this option is selected then read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+ Say N if you are unsure.
+
config RCU_TRACE
bool "Enable tracing for RCU - currently stats in debugfs"
+ depends on PREEMPT_RCU
select DEBUG_FS
default y
help
diff --git a/kernel/Makefile b/kernel/Makefile
index 6c584c55a6e9..6c5f081132a4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o nsproxy.o srcu.o \
+ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o
obj-$(CONFIG_SYSCTL) += sysctl_check.o
@@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 521dfa53cb99..91e1cfd734d2 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -58,6 +58,7 @@
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
+#include <linux/pid_namespace.h>
/*
* These constants control the amount of freespace that suspend and
@@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
/*
* External references and all of the globals.
*/
-static void do_acct_process(struct file *);
+static void do_acct_process(struct pid_namespace *ns, struct file *);
/*
* This structure is used so that all the data protected by lock
@@ -86,6 +87,7 @@ struct acct_glbs {
volatile int active;
volatile int needcheck;
struct file *file;
+ struct pid_namespace *ns;
struct timer_list timer;
};
@@ -175,9 +177,11 @@ out:
static void acct_file_reopen(struct file *file)
{
struct file *old_acct = NULL;
+ struct pid_namespace *old_ns = NULL;
if (acct_globals.file) {
old_acct = acct_globals.file;
+ old_ns = acct_globals.ns;
del_timer(&acct_globals.timer);
acct_globals.active = 0;
acct_globals.needcheck = 0;
@@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file)
}
if (file) {
acct_globals.file = file;
+ acct_globals.ns = get_pid_ns(task_active_pid_ns(current));
acct_globals.needcheck = 0;
acct_globals.active = 1;
/* It's been deleted if it was used before so this is safe */
@@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file)
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
spin_unlock(&acct_globals.lock);
- do_acct_process(old_acct);
+ do_acct_process(old_ns, old_acct);
filp_close(old_acct, NULL);
+ put_pid_ns(old_ns);
spin_lock(&acct_globals.lock);
}
}
@@ -419,7 +425,7 @@ static u32 encode_float(u64 value)
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
-static void do_acct_process(struct file *file)
+static void do_acct_process(struct pid_namespace *ns, struct file *file)
{
struct pacct_struct *pacct = &current->signal->pacct;
acct_t ac;
@@ -481,8 +487,10 @@ static void do_acct_process(struct file *file)
ac.ac_gid16 = current->gid;
#endif
#if ACCT_VERSION==3
- ac.ac_pid = current->tgid;
- ac.ac_ppid = current->real_parent->tgid;
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
+ rcu_read_unlock();
#endif
spin_lock_irq(&current->sighand->siglock);
@@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead)
void acct_process(void)
{
struct file *file = NULL;
+ struct pid_namespace *ns;
/*
* accelerate the common fastpath:
@@ -592,8 +601,10 @@ void acct_process(void)
return;
}
get_file(file);
+ ns = get_pid_ns(acct_globals.ns);
spin_unlock(&acct_globals.lock);
- do_acct_process(file);
+ do_acct_process(ns, file);
fput(file);
+ put_pid_ns(ns);
}
diff --git a/kernel/audit.c b/kernel/audit.c
index c8555b180213..b782b046543d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -78,9 +78,13 @@ static int audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
static int audit_failure = AUDIT_FAIL_PRINTK;
-/* If audit records are to be written to the netlink socket, audit_pid
- * contains the (non-zero) pid. */
+/*
+ * If audit records are to be written to the netlink socket, audit_pid
+ * contains the pid of the auditd process and audit_nlk_pid contains
+ * the pid to use to send netlink messages to that process.
+ */
int audit_pid;
+static int audit_nlk_pid;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -170,7 +174,9 @@ void audit_panic(const char *message)
printk(KERN_ERR "audit: %s\n", message);
break;
case AUDIT_FAIL_PANIC:
- panic("audit: %s\n", message);
+ /* test audit_pid since printk is always lossy, why bother? */
+ if (audit_pid)
+ panic("audit: %s\n", message);
break;
}
}
@@ -348,10 +354,11 @@ static int kauditd_thread(void *dummy)
wake_up(&audit_backlog_wait);
if (skb) {
if (audit_pid) {
- int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+ int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
if (err < 0) {
BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_log_lost("auditd dissapeared\n");
audit_pid = 0;
}
} else {
@@ -623,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
sid, 1);
audit_pid = new_pid;
+ audit_nlk_pid = NETLINK_CB(skb).pid;
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
err = audit_set_rate_limit(status_get->rate_limit,
@@ -1261,8 +1269,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
/**
* audit_string_contains_control - does a string need to be logged in hex
- * @string - string to be checked
- * @len - max length of the string to check
+ * @string: string to be checked
+ * @len: max length of the string to check
*/
int audit_string_contains_control(const char *string, size_t len)
{
@@ -1277,7 +1285,7 @@ int audit_string_contains_control(const char *string, size_t len)
/**
* audit_log_n_untrustedstring - log a string that may contain random characters
* @ab: audit_buffer
- * @len: lenth of string (not including trailing null)
+ * @len: length of string (not including trailing null)
* @string: string to be logged
*
* This code will escape a string that is passed to it if the string
@@ -1312,26 +1320,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
- struct dentry *dentry, struct vfsmount *vfsmnt)
+ struct path *path)
{
- char *p, *path;
+ char *p, *pathname;
if (prefix)
audit_log_format(ab, " %s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
- path = kmalloc(PATH_MAX+11, ab->gfp_mask);
- if (!path) {
+ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+ if (!pathname) {
audit_log_format(ab, "<no memory>");
return;
}
- p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+ p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
audit_log_format(ab, "<too long>");
} else
audit_log_untrustedstring(ab, p);
- kfree(path);
+ kfree(pathname);
}
/**
@@ -1350,17 +1358,19 @@ void audit_log_end(struct audit_buffer *ab)
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
if (audit_pid) {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
skb_queue_tail(&audit_skb_queue, ab->skb);
ab->skb = NULL;
wake_up_interruptible(&kauditd_wait);
- } else if (printk_ratelimit()) {
- struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0));
- } else {
- audit_log_lost("printk limit exceeded\n");
+ } else if (nlh->nlmsg_type != AUDIT_EOE) {
+ if (printk_ratelimit()) {
+ printk(KERN_NOTICE "type=%d %s\n",
+ nlh->nlmsg_type,
+ ab->skb->data + NLMSG_SPACE(0));
+ } else
+ audit_log_lost("printk limit exceeded\n");
}
}
audit_buffer_free(ab);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f4fcf58f20f8..9ef5e0aacc3c 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -549,8 +549,8 @@ void audit_trim_trees(void)
if (err)
goto skip_it;
- root_mnt = collect_mounts(nd.mnt, nd.dentry);
- path_release(&nd);
+ root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+ path_put(&nd.path);
if (!root_mnt)
goto skip_it;
@@ -583,17 +583,17 @@ skip_it:
static int is_under(struct vfsmount *mnt, struct dentry *dentry,
struct nameidata *nd)
{
- if (mnt != nd->mnt) {
+ if (mnt != nd->path.mnt) {
for (;;) {
if (mnt->mnt_parent == mnt)
return 0;
- if (mnt->mnt_parent == nd->mnt)
+ if (mnt->mnt_parent == nd->path.mnt)
break;
mnt = mnt->mnt_parent;
}
dentry = mnt->mnt_mountpoint;
}
- return is_subdir(dentry, nd->dentry);
+ return is_subdir(dentry, nd->path.dentry);
}
int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule)
err = path_lookup(tree->pathname, 0, &nd);
if (err)
goto Err;
- mnt = collect_mounts(nd.mnt, nd.dentry);
- path_release(&nd);
+ mnt = collect_mounts(nd.path.mnt, nd.path.dentry);
+ path_put(&nd.path);
if (!mnt) {
err = -ENOMEM;
goto Err;
@@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new)
err = path_lookup(new, 0, &nd);
if (err)
return err;
- tagged = collect_mounts(nd.mnt, nd.dentry);
- path_release(&nd);
+ tagged = collect_mounts(nd.path.mnt, nd.path.dentry);
+ path_put(&nd.path);
if (!tagged)
return -ENOMEM;
@@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new)
drop_collected_mounts(tagged);
return err;
}
- mnt = mntget(nd.mnt);
- dentry = dget(nd.dentry);
- path_release(&nd);
+ mnt = mntget(nd.path.mnt);
+ dentry = dget(nd.path.dentry);
+ path_put(&nd.path);
if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
follow_up(&mnt, &dentry);
@@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new)
spin_lock(&vfsmount_lock);
if (!is_under(mnt, dentry, &nd)) {
spin_unlock(&vfsmount_lock);
- path_release(&nd);
+ path_put(&nd.path);
put_tree(tree);
mutex_lock(&audit_filter_mutex);
continue;
}
spin_unlock(&vfsmount_lock);
- path_release(&nd);
+ path_put(&nd.path);
list_for_each_entry(p, &list, mnt_list) {
failed = tag_chunk(p->mnt_root->d_inode, tree);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6f19fd477aac..2f2914b7cc30 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp)
inotify_init_watch(&parent->wdata);
/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
get_inotify_watch(&parent->wdata);
- wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
- AUDIT_IN_WATCH);
+ wd = inotify_add_watch(audit_ih, &parent->wdata,
+ ndp->path.dentry->d_inode, AUDIT_IN_WATCH);
if (wd < 0) {
audit_free_parent(&parent->wdata);
return ERR_PTR(wd);
@@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp,
static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
{
if (ndp) {
- path_release(ndp);
+ path_put(&ndp->path);
kfree(ndp);
}
if (ndw) {
- path_release(ndw);
+ path_put(&ndw->path);
kfree(ndw);
}
}
@@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
/* update watch filter fields */
if (ndw) {
- watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->dentry->d_inode->i_ino;
+ watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
+ watch->ino = ndw->path.dentry->d_inode->i_ino;
}
/* The audit_filter_mutex must not be held during inotify calls because
@@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
*/
mutex_unlock(&audit_filter_mutex);
- if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
+ if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode,
+ &i_watch) < 0) {
parent = audit_init_parent(ndp);
if (IS_ERR(parent)) {
/* caller expects mutex locked */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c06ecf38d7b..782262e4107d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -208,8 +208,7 @@ struct audit_context {
int name_count;
struct audit_names names[AUDIT_NAMES];
char * filterkey; /* key for rule that triggered record */
- struct dentry * pwd;
- struct vfsmount * pwdmnt;
+ struct path pwd;
struct audit_context *previous; /* For nested syscalls */
struct audit_aux_data *aux;
struct audit_aux_data *aux_pids;
@@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context)
__putname(context->names[i].name);
}
context->name_count = 0;
- if (context->pwd)
- dput(context->pwd);
- if (context->pwdmnt)
- mntput(context->pwdmnt);
- context->pwd = NULL;
- context->pwdmnt = NULL;
+ path_put(&context->pwd);
+ context->pwd.dentry = NULL;
+ context->pwd.mnt = NULL;
}
static inline void audit_free_aux(struct audit_context *context)
@@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
audit_log_d_path(ab, "exe=",
- vma->vm_file->f_path.dentry,
- vma->vm_file->f_path.mnt);
+ &vma->vm_file->f_path);
break;
}
vma = vma->vm_next;
@@ -1005,9 +1000,10 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* for strings that are too long, we should not have created
* any.
*/
- if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) {
+ if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
WARN_ON(1);
send_sig(SIGKILL, current, 0);
+ return -1;
}
/* walk the whole argument looking for non-ascii chars */
@@ -1025,6 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
if (ret) {
WARN_ON(1);
send_sig(SIGKILL, current, 0);
+ return -1;
}
buf[to_send] = '\0';
has_cntl = audit_string_contains_control(buf, to_send);
@@ -1073,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* so we can be sure nothing was lost.
*/
if ((i == 0) && (too_long))
- audit_log_format(*ab, "a%d_len=%ld ", arg_num,
+ audit_log_format(*ab, "a%d_len=%zu ", arg_num,
has_cntl ? 2*len : len);
/*
@@ -1088,6 +1085,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
if (ret) {
WARN_ON(1);
send_sig(SIGKILL, current, 0);
+ return -1;
}
buf[to_send] = '\0';
@@ -1341,10 +1339,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
context->target_sid, context->target_comm))
call_panic = 1;
- if (context->pwd && context->pwdmnt) {
+ if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+ audit_log_d_path(ab, "cwd=", &context->pwd);
audit_log_end(ab);
}
}
@@ -1367,8 +1365,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case 0:
/* name was specified as a relative path and the
* directory component is the cwd */
- audit_log_d_path(ab, " name=", context->pwd,
- context->pwdmnt);
+ audit_log_d_path(ab, " name=", &context->pwd);
break;
default:
/* log the name's directory component */
@@ -1695,10 +1692,10 @@ void __audit_getname(const char *name)
context->names[context->name_count].ino = (unsigned long)-1;
context->names[context->name_count].osid = 0;
++context->name_count;
- if (!context->pwd) {
+ if (!context->pwd.dentry) {
read_lock(&current->fs->lock);
- context->pwd = dget(current->fs->pwd);
- context->pwdmnt = mntget(current->fs->pwdmnt);
+ context->pwd = current->fs->pwd;
+ path_get(&current->fs->pwd);
read_unlock(&current->fs->lock);
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4766bb65e4d9..6d8de051382b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -113,9 +113,9 @@ static int root_count;
#define dummytop (&rootnode.top_cgroup)
/* This flag indicates whether tasks in the fork and exit paths should
- * take callback_mutex and check for fork/exit handlers to call. This
- * avoids us having to do extra work in the fork/exit path if none of the
- * subsystems need to be called.
+ * check for fork/exit handlers to call. This avoids us having to do
+ * extra work in the fork/exit path if none of the subsystems need to
+ * be called.
*/
static int need_forkexit_callback;
@@ -307,7 +307,6 @@ static inline void put_css_set_taskexit(struct css_set *cg)
* template: location in which to build the desired set of subsystem
* state objects for the new cgroup group
*/
-
static struct css_set *find_existing_css_set(
struct css_set *oldcg,
struct cgroup *cgrp,
@@ -320,7 +319,7 @@ static struct css_set *find_existing_css_set(
/* Build the set of subsystem state objects that we want to
* see in the new css_set */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- if (root->subsys_bits & (1ull << i)) {
+ if (root->subsys_bits & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
* cgroup */
@@ -354,7 +353,6 @@ static struct css_set *find_existing_css_set(
* and chains them on tmp through their cgrp_link_list fields. Returns 0 on
* success or a negative error
*/
-
static int allocate_cg_links(int count, struct list_head *tmp)
{
struct cg_cgroup_link *link;
@@ -396,7 +394,6 @@ static void free_cg_links(struct list_head *tmp)
* substituted into the appropriate hierarchy. Must be called with
* cgroup_mutex held
*/
-
static struct css_set *find_css_set(
struct css_set *oldcg, struct cgroup *cgrp)
{
@@ -473,7 +470,6 @@ static struct css_set *find_css_set(
/* Link this cgroup group into the list */
list_add(&res->list, &init_css_set.list);
css_set_count++;
- INIT_LIST_HEAD(&res->tasks);
write_unlock(&css_set_lock);
return res;
@@ -507,8 +503,8 @@ static struct css_set *find_css_set(
* critical pieces of code here. The exception occurs on cgroup_exit(),
* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
* is taken, and if the cgroup count is zero, a usermode call made
- * to /sbin/cgroup_release_agent with the name of the cgroup (path
- * relative to the root of cgroup file system) as the argument.
+ * to the release agent with the name of the cgroup (path relative to
+ * the root of cgroup file system) as the argument.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
@@ -521,7 +517,7 @@ static struct css_set *find_css_set(
*
* The need for this exception arises from the action of
* cgroup_attach_task(), which overwrites one task's cgroup pointer with
- * another. It does so using cgroup_mutexe, however there are
+ * another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
@@ -537,7 +533,6 @@ static struct css_set *find_css_set(
* cgroup_lock - lock out any changes to cgroup structures
*
*/
-
void cgroup_lock(void)
{
mutex_lock(&cgroup_mutex);
@@ -548,7 +543,6 @@ void cgroup_lock(void)
*
* Undo the lock taken in a previous cgroup_lock() call.
*/
-
void cgroup_unlock(void)
{
mutex_unlock(&cgroup_mutex);
@@ -590,7 +584,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
-
static void cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
@@ -600,7 +593,6 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
return;
}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -696,7 +688,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
added_bits = final_bits & ~root->actual_subsys_bits;
/* Check that any added subsystems are currently free */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- unsigned long long bit = 1ull << i;
+ unsigned long bit = 1UL << i;
struct cgroup_subsys *ss = subsys[i];
if (!(bit & added_bits))
continue;
@@ -790,7 +782,14 @@ static int parse_cgroupfs_options(char *data,
if (!*token)
return -EINVAL;
if (!strcmp(token, "all")) {
- opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
+ /* Add all non-disabled subsystems */
+ int i;
+ opts->subsys_bits = 0;
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (!ss->disabled)
+ opts->subsys_bits |= 1ul << i;
+ }
} else if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
} else if (!strncmp(token, "release_agent=", 14)) {
@@ -808,7 +807,8 @@ static int parse_cgroupfs_options(char *data,
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
ss = subsys[i];
if (!strcmp(token, ss->name)) {
- set_bit(i, &opts->subsys_bits);
+ if (!ss->disabled)
+ set_bit(i, &opts->subsys_bits);
break;
}
}
@@ -927,7 +927,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
if (!inode)
return -ENOMEM;
- inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_op = &cgroup_dir_inode_operations;
/* directories start off with i_nlink == 2 (for "." entry) */
@@ -961,8 +960,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
}
root = kzalloc(sizeof(*root), GFP_KERNEL);
- if (!root)
+ if (!root) {
+ if (opts.release_agent)
+ kfree(opts.release_agent);
return -ENOMEM;
+ }
init_cgroup_root(root);
root->subsys_bits = opts.subsys_bits;
@@ -1129,8 +1131,13 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
return dentry->d_fsdata;
}
-/*
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
+/**
+ * cgroup_path - generate the path of a cgroup
+ * @cgrp: the cgroup in question
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Called with cgroup_mutex held. Writes path of cgroup into buf.
* Returns 0 on success, -errno on error.
*/
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
@@ -1188,11 +1195,13 @@ static void get_first_subsys(const struct cgroup *cgrp,
*subsys_id = test_ss->subsys_id;
}
-/*
- * Attach task 'tsk' to cgroup 'cgrp'
+/**
+ * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
+ * @cgrp: the cgroup the task is attaching to
+ * @tsk: the task to be attached
*
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'pid' during call.
+ * Call holding cgroup_mutex. May take task_lock of
+ * the task 'tsk' during call.
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
@@ -1293,7 +1302,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
}
/* The various types of files and directories in a cgroup file system */
-
enum cgroup_filetype {
FILE_ROOT,
FILE_DIR,
@@ -1584,12 +1592,11 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
}
/*
- * cgroup_create_dir - create a directory for an object.
- * cgrp: the cgroup we create the directory for.
- * It must have a valid ->parent field
- * And we are going to fill its ->dentry field.
- * dentry: dentry of the new cgroup
- * mode: mode to set on new directory.
+ * cgroup_create_dir - create a directory for an object.
+ * @cgrp: the cgroup we create the directory for. It must have a valid
+ * ->parent field. And we are going to fill its ->dentry field.
+ * @dentry: dentry of the new cgroup
+ * @mode: mode to set on new directory.
*/
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
int mode)
@@ -1651,8 +1658,12 @@ int cgroup_add_files(struct cgroup *cgrp,
return 0;
}
-/* Count the number of tasks in a cgroup. */
-
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ */
int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
@@ -1711,7 +1722,12 @@ void cgroup_enable_task_cg_lists(void)
use_task_css_set_links = 1;
do_each_thread(g, p) {
task_lock(p);
- if (list_empty(&p->cg_list))
+ /*
+ * We should check if the process is exiting, otherwise
+ * it will race with cgroup_exit() in that the list
+ * entry won't be deleted though the process has exited.
+ */
+ if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
list_add(&p->cg_list, &p->cgroups->tasks);
task_unlock(p);
} while_each_thread(g, p);
@@ -1962,12 +1978,13 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
}
/**
- * Build and fill cgroupstats so that taskstats can export it to user
- * space.
- *
+ * cgroupstats_build - build and fill cgroupstats
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
@@ -2078,7 +2095,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
kfree(pidarray);
} else {
- ctr->buf = 0;
+ ctr->buf = NULL;
ctr->bufsz = 0;
}
file->private_data = ctr;
@@ -2199,14 +2216,13 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
}
/*
- * cgroup_create - create a cgroup
- * parent: cgroup that will be parent of the new cgroup.
- * name: name of the new cgroup. Will be strcpy'ed.
- * mode: mode to set on new inode
+ * cgroup_create - create a cgroup
+ * @parent: cgroup that will be parent of the new cgroup
+ * @dentry: dentry of the new cgroup
+ * @mode: mode to set on new inode
*
- * Must be called with the mutex on the parent inode held
+ * Must be called with the mutex on the parent inode held
*/
-
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
int mode)
{
@@ -2229,7 +2245,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
mutex_lock(&cgroup_mutex);
- cgrp->flags = 0;
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->css_sets);
@@ -2239,6 +2254,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
+ if (notify_on_release(parent))
+ set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
if (IS_ERR(css)) {
@@ -2349,13 +2367,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
parent = cgrp->parent;
root = cgrp->root;
sb = root->sb;
+
/*
- * Call pre_destroy handlers of subsys
+ * Call pre_destroy handlers of subsys. Notify subsystems
+ * that rmdir() request comes.
*/
cgroup_call_pre_destroy(cgrp);
- /*
- * Notify subsyses that rmdir() request comes.
- */
if (cgroup_has_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
@@ -2431,8 +2448,10 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
}
/**
- * cgroup_init_early - initialize cgroups at system boot, and
- * initialize any subsystems that request early init.
+ * cgroup_init_early - cgroup initialization at system boot
+ *
+ * Initialize cgroups at system boot, and initialize any
+ * subsystems that request early init.
*/
int __init cgroup_init_early(void)
{
@@ -2474,8 +2493,10 @@ int __init cgroup_init_early(void)
}
/**
- * cgroup_init - register cgroup filesystem and /proc file, and
- * initialize any subsystems that didn't request early init.
+ * cgroup_init - cgroup initialization
+ *
+ * Register cgroup filesystem and /proc file, and initialize
+ * any subsystems that didn't request early init.
*/
int __init cgroup_init(void)
{
@@ -2553,6 +2574,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
/* Skip this hierarchy if it has no active subsystems */
if (!root->actual_subsys_bits)
continue;
+ seq_printf(m, "%lu:", root->subsys_bits);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
seq_putc(m, ':');
@@ -2592,13 +2614,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
int i;
- seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
+ seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
- seq_printf(m, "%s\t%lu\t%d\n",
+ seq_printf(m, "%s\t%lu\t%d\t%d\n",
ss->name, ss->root->subsys_bits,
- ss->root->number_of_cgroups);
+ ss->root->number_of_cgroups, !ss->disabled);
}
mutex_unlock(&cgroup_mutex);
return 0;
@@ -2606,7 +2628,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
static int cgroupstats_open(struct inode *inode, struct file *file)
{
- return single_open(file, proc_cgroupstats_show, 0);
+ return single_open(file, proc_cgroupstats_show, NULL);
}
static struct file_operations proc_cgroupstats_operations = {
@@ -2618,7 +2640,7 @@ static struct file_operations proc_cgroupstats_operations = {
/**
* cgroup_fork - attach newly forked task to its parent's cgroup.
- * @tsk: pointer to task_struct of forking parent process.
+ * @child: pointer to task_struct of the newly forked child process.
*
* Description: A task inherits its parent's cgroup at fork().
*
@@ -2642,9 +2664,12 @@ void cgroup_fork(struct task_struct *child)
}
/**
- * cgroup_fork_callbacks - called on a new task very soon before
- * adding it to the tasklist. No need to take any locks since no-one
- * can be operating on this task
+ * cgroup_fork_callbacks - run fork callbacks
+ * @child: the new task
+ *
+ * Called on a new task very soon before adding it to the
+ * tasklist. No need to take any locks since no-one can
+ * be operating on this task.
*/
void cgroup_fork_callbacks(struct task_struct *child)
{
@@ -2659,11 +2684,14 @@ void cgroup_fork_callbacks(struct task_struct *child)
}
/**
- * cgroup_post_fork - called on a new task after adding it to the
- * task list. Adds the task to the list running through its css_set
- * if necessary. Has to be after the task is visible on the task list
- * in case we race with the first call to cgroup_iter_start() - to
- * guarantee that the new task ends up on its list. */
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+ * Adds the task to the list running through its css_set if necessary.
+ * Has to be after the task is visible on the task list in case we race
+ * with the first call to cgroup_iter_start() - to guarantee that the
+ * new task ends up on its list.
+ */
void cgroup_post_fork(struct task_struct *child)
{
if (use_task_css_set_links) {
@@ -2676,6 +2704,7 @@ void cgroup_post_fork(struct task_struct *child)
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
+ * @run_callbacks: run exit callbacks?
*
* Description: Detach cgroup from @tsk and release it.
*
@@ -2706,7 +2735,6 @@ void cgroup_post_fork(struct task_struct *child)
* top_cgroup isn't going away, and either task has PF_EXITING set,
* which wards off any cgroup_attach_task() attempts, or task is a failed
* fork, never visible to cgroup_attach_task.
- *
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
@@ -2743,9 +2771,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
}
/**
- * cgroup_clone - duplicate the current cgroup in the hierarchy
- * that the given subsystem is attached to, and move this task into
- * the new child
+ * cgroup_clone - clone the cgroup the given subsystem is attached to
+ * @tsk: the task to be moved
+ * @subsys: the given subsystem
+ *
+ * Duplicate the current cgroup in the hierarchy that the given
+ * subsystem is attached to, and move this task into the new
+ * child.
*/
int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
{
@@ -2858,9 +2890,12 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
return ret;
}
-/*
- * See if "cgrp" is a descendant of the current task's cgroup in
- * the appropriate hierarchy
+/**
+ * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * @cgrp: the cgroup in question
+ *
+ * See if @cgrp is a descendant of the current task's cgroup in
+ * the appropriate hierarchy.
*
* If we are sending in dummytop, then presumably we are creating
* the top cgroup in the subsystem.
@@ -2939,9 +2974,7 @@ void __css_put(struct cgroup_subsys_state *css)
* release agent task. We don't bother to wait because the caller of
* this routine has no use for the exit status of the release agent
* task, so no sense holding our caller up for that.
- *
*/
-
static void cgroup_release_agent(struct work_struct *work)
{
BUG_ON(work != &release_agent_work);
@@ -2991,3 +3024,27 @@ static void cgroup_release_agent(struct work_struct *work)
spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
+
+/*
+ * Handler for the "cgroup_disable=" boot parameter.
+ *
+ * @str is a comma-separated list of subsystem names.  For each non-empty
+ * entry, the first subsystem with a matching name gets ->disabled set and
+ * a message is logged; entries matching no subsystem are ignored.
+ * Always returns 1.
+ */
+static int __init cgroup_disable(char *str)
+{
+	char *name;
+	int idx;
+
+	for (name = strsep(&str, ","); name != NULL; name = strsep(&str, ",")) {
+		if (*name == '\0')
+			continue;
+
+		for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
+			struct cgroup_subsys *ss = subsys[idx];
+
+			if (strcmp(name, ss->name) != 0)
+				continue;
+
+			ss->disabled = 1;
+			printk(KERN_INFO "Disabling %s control group"
+				" subsystem\n", ss->name);
+			break;
+		}
+	}
+	return 1;
+}
+__setup("cgroup_disable=", cgroup_disable);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e296ed81d4d..a1b61f414228 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* Call without callback_mutex or task_lock() held. May be
* called with or without cgroup_mutex held. Thanks in part to
* 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL. This routine also might acquire callback_mutex and
- * current->mm->mmap_sem during call.
+ * be NULL. This routine also might acquire callback_mutex during
+ * call.
*
* Reading current->cpuset->mems_generation doesn't need task_lock
* to guard the current->cpuset derefence, because it is guarded
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b893e78ce61..073005b1cfb2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp)
static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
{
struct task_struct *p;
- int ret = 1;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
- if (p == ignored_task
- || p->exit_state
- || is_global_init(p->real_parent))
+ if ((p == ignored_task) ||
+ (p->exit_state && thread_group_empty(p)) ||
+ is_global_init(p->real_parent))
continue;
+
if (task_pgrp(p->real_parent) != pgrp &&
- task_session(p->real_parent) == task_session(p)) {
- ret = 0;
- break;
- }
+ task_session(p->real_parent) == task_session(p))
+ return 0;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
- return ret; /* (sighing) "Often!" */
+
+ return 1;
}
int is_current_pgrp_orphaned(void)
@@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp)
return retval;
}
+/*
+ * Check to see if any process groups have become orphaned as
+ * a result of our exiting, and if they have any stopped jobs,
+ * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ *
+ * @parent == NULL is the exit case: @tsk->real_parent is used as the
+ * parent and @tsk itself is ignored in the orphan check.  Otherwise
+ * (reparent case) the given @parent is used and no task is ignored.
+ */
+static void
+kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
+{
+	struct pid *pgrp = task_pgrp(tsk);
+	struct task_struct *skip;
+
+	if (parent) {
+		/* reparent: our child is in a different pgrp than
+		 * we are, and it was the only connection outside.
+		 */
+		skip = NULL;
+	} else {
+		/* exit: our father is in a different pgrp than
+		 * we are and we were the only connection outside.
+		 */
+		skip = tsk;
+		parent = tsk->real_parent;
+	}
+
+	if (task_pgrp(parent) == pgrp)
+		return;
+	if (task_session(parent) != task_session(tsk))
+		return;
+	if (!will_become_orphaned_pgrp(pgrp, skip))
+		return;
+	if (!has_stopped_jobs(pgrp))
+		return;
+
+	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+}
+
/**
* reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
*
@@ -512,14 +542,10 @@ static void __put_fs_struct(struct fs_struct *fs)
{
/* No need to hold fs->lock if we are killing it */
if (atomic_dec_and_test(&fs->count)) {
- dput(fs->root);
- mntput(fs->rootmnt);
- dput(fs->pwd);
- mntput(fs->pwdmnt);
- if (fs->altroot) {
- dput(fs->altroot);
- mntput(fs->altrootmnt);
- }
+ path_put(&fs->root);
+ path_put(&fs->pwd);
+ if (fs->altroot.dentry)
+ path_put(&fs->altroot);
kmem_cache_free(fs_cachep, fs);
}
}
@@ -639,22 +665,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
p->exit_signal != -1 && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
- /*
- * process group orphan check
- * Case ii: Our child is in a different pgrp
- * than we are, and it was the only connection
- * outside, so the child pgrp is now orphaned.
- */
- if ((task_pgrp(p) != task_pgrp(father)) &&
- (task_session(p) == task_session(father))) {
- struct pid *pgrp = task_pgrp(p);
-
- if (will_become_orphaned_pgrp(pgrp, NULL) &&
- has_stopped_jobs(pgrp)) {
- __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
- __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
- }
- }
+ kill_orphaned_pgrp(p, father);
}
/*
@@ -739,11 +750,9 @@ static void forget_original_parent(struct task_struct *father)
* Send signals to all our closest relatives so that they know
* to properly mourn us..
*/
-static void exit_notify(struct task_struct *tsk)
+static void exit_notify(struct task_struct *tsk, int group_dead)
{
int state;
- struct task_struct *t;
- struct pid *pgrp;
/*
* This does two things:
@@ -757,25 +766,8 @@ static void exit_notify(struct task_struct *tsk)
exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
- /*
- * Check to see if any process groups have become orphaned
- * as a result of our exiting, and if they have any stopped
- * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
- *
- * Case i: Our father is in a different pgrp than we are
- * and we were the only connection outside, so our pgrp
- * is about to become orphaned.
- */
- t = tsk->real_parent;
-
- pgrp = task_pgrp(tsk);
- if ((task_pgrp(t) != pgrp) &&
- (task_session(t) == task_session(tsk)) &&
- will_become_orphaned_pgrp(pgrp, tsk) &&
- has_stopped_jobs(pgrp)) {
- __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
- __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
- }
+ if (group_dead)
+ kill_orphaned_pgrp(tsk->group_leader, NULL);
/* Let father know we died
*
@@ -792,8 +784,8 @@ static void exit_notify(struct task_struct *tsk)
* the same after a fork.
*/
if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
- ( tsk->parent_exec_id != t->self_exec_id ||
- tsk->self_exec_id != tsk->parent_exec_id)
+ (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
+ tsk->self_exec_id != tsk->parent_exec_id)
&& !capable(CAP_KILL))
tsk->exit_signal = SIGCHLD;
@@ -990,7 +982,7 @@ NORET_TYPE void do_exit(long code)
module_put(tsk->binfmt->module);
proc_exit_connector(tsk);
- exit_notify(tsk);
+ exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
mpol_free(tsk->mempolicy);
tsk->mempolicy = NULL;
@@ -1386,7 +1378,7 @@ unlock_sig:
if (!retval && infop)
retval = put_user(0, &infop->si_errno);
if (!retval && infop)
- retval = put_user(why, &infop->si_code);
+ retval = put_user((short)why, &infop->si_code);
if (!retval && infop)
retval = put_user(exit_code, &infop->si_status);
if (!retval && infop)
@@ -1616,7 +1608,7 @@ asmlinkage long sys_waitid(int which, pid_t upid,
put_pid(pid);
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1648,7 +1640,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
put_pid(pid);
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 4363a4eb84e3..9c042f901570 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -394,7 +394,6 @@ void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
mm_free_pgd(mm);
- mm_free_cgroup(mm);
destroy_context(mm);
free_mm(mm);
}
@@ -416,6 +415,7 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ mm_free_cgroup(mm);
mmdrop(mm);
}
}
@@ -600,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
rwlock_init(&fs->lock);
fs->umask = old->umask;
read_lock(&old->lock);
- fs->rootmnt = mntget(old->rootmnt);
- fs->root = dget(old->root);
- fs->pwdmnt = mntget(old->pwdmnt);
- fs->pwd = dget(old->pwd);
- if (old->altroot) {
- fs->altrootmnt = mntget(old->altrootmnt);
- fs->altroot = dget(old->altroot);
+ fs->root = old->root;
+ path_get(&old->root);
+ fs->pwd = old->pwd;
+ path_get(&old->pwd);
+ if (old->altroot.dentry) {
+ fs->altroot = old->altroot;
+ path_get(&old->altroot);
} else {
- fs->altrootmnt = NULL;
- fs->altroot = NULL;
+ fs->altroot.mnt = NULL;
+ fs->altroot.dentry = NULL;
}
read_unlock(&old->lock);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index a6baaec44b8f..e43945e995f5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,8 @@
#include "rtmutex_common.h"
+int __read_mostly futex_cmpxchg_enabled;
+
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
@@ -279,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
*/
static void get_futex_key_refs(union futex_key *key)
{
- if (key->both.ptr == 0)
+ if (key->both.ptr == NULL)
return;
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
@@ -469,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr)
struct futex_hash_bucket *hb;
union futex_key key;
+ if (!futex_cmpxchg_enabled)
+ return;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1870,6 +1874,8 @@ asmlinkage long
sys_set_robust_list(struct robust_list_head __user *head,
size_t len)
{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
/*
* The kernel knows only one size for now:
*/
@@ -1894,6 +1900,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
struct robust_list_head __user *head;
unsigned long ret;
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
if (!pid)
head = current->robust_list;
else {
@@ -1997,6 +2006,9 @@ void exit_robust_list(struct task_struct *curr)
unsigned long futex_offset;
int rc;
+ if (!futex_cmpxchg_enabled)
+ return;
+
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
@@ -2051,7 +2063,7 @@ void exit_robust_list(struct task_struct *curr)
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
- int ret;
+ int ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
struct rw_semaphore *fshared = NULL;
@@ -2083,13 +2095,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
break;
case FUTEX_LOCK_PI:
- ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
+ if (futex_cmpxchg_enabled)
+ ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
break;
case FUTEX_UNLOCK_PI:
- ret = futex_unlock_pi(uaddr, fshared);
+ if (futex_cmpxchg_enabled)
+ ret = futex_unlock_pi(uaddr, fshared);
break;
case FUTEX_TRYLOCK_PI:
- ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
+ if (futex_cmpxchg_enabled)
+ ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
break;
default:
ret = -ENOSYS;
@@ -2116,7 +2131,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
t = timespec_to_ktime(ts);
if (cmd == FUTEX_WAIT)
- t = ktime_add(ktime_get(), t);
+ t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
/*
@@ -2143,10 +2158,31 @@ static struct file_system_type futex_fs_type = {
.kill_sb = kill_anon_super,
};
-static int __init init(void)
+static int __init futex_init(void)
{
- int i = register_filesystem(&futex_fs_type);
+ u32 curval;
+ int i;
+
+ /*
+ * This will fail and we want it. Some arch implementations do
+ * runtime detection of the futex_atomic_cmpxchg_inatomic()
+ * functionality. We want to know that before we call in any
+ * of the complex code paths. Also we want to prevent
+ * registration of robust lists in that case. NULL is
+ * guaranteed to fault and we get -EFAULT on functional
+ * implementation, the non functional ones will return
+ * -ENOSYS.
+ */
+ curval = cmpxchg_futex_value_locked(NULL, 0, 0);
+ if (curval == -EFAULT)
+ futex_cmpxchg_enabled = 1;
+ for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+ plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
+ spin_lock_init(&futex_queues[i].lock);
+ }
+
+ i = register_filesystem(&futex_fs_type);
if (i)
return i;
@@ -2156,10 +2192,6 @@ static int __init init(void)
return PTR_ERR(futex_mnt);
}
- for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
- plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
- spin_lock_init(&futex_queues[i].lock);
- }
return 0;
}
-__initcall(init);
+__initcall(futex_init);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 133d558db452..04ac3a9e42cf 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
return 0;
}
-static void __user *futex_uaddr(struct robust_list *entry,
+static void __user *futex_uaddr(struct robust_list __user *entry,
compat_long_t futex_offset)
{
compat_uptr_t base = ptr_to_compat(entry);
@@ -54,6 +54,9 @@ void compat_exit_robust_list(struct task_struct *curr)
compat_long_t futex_offset;
int rc;
+ if (!futex_cmpxchg_enabled)
+ return;
+
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
@@ -115,6 +118,9 @@ asmlinkage long
compat_sys_set_robust_list(struct compat_robust_list_head __user *head,
compat_size_t len)
{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
if (unlikely(len != sizeof(*head)))
return -EINVAL;
@@ -130,6 +136,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
struct compat_robust_list_head __user *head;
unsigned long ret;
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
if (!pid)
head = current->compat_robust_list;
else {
@@ -176,7 +185,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
t = timespec_to_ktime(ts);
if (cmd == FUTEX_WAIT)
- t = ktime_add(ktime_get(), t);
+ t = ktime_add_safe(ktime_get(), t);
tp = &t;
}
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE)
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3f4a57c7895d..98bee013f71f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -326,6 +326,23 @@ u64 ktime_divns(const ktime_t kt, s64 div)
#endif /* BITS_PER_LONG >= 64 */
/*
+ * Add two ktime values and do a safety check for overflow:
+ */
+ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
+{
+	ktime_t sum = ktime_add(lhs, rhs);
+	int wrapped = sum.tv64 < 0 || sum.tv64 < lhs.tv64 ||
+		      sum.tv64 < rhs.tv64;
+
+	/*
+	 * On wrap-around clamp to KTIME_SEC_MAX, the maximum timeout
+	 * which we can return to user space in a timespec:
+	 */
+	if (wrapped)
+		return ktime_set(KTIME_SEC_MAX, 0);
+
+	return sum;
+}
+
+/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
@@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer,
ktime_t expires = ktime_sub(timer->expires, base->offset);
int res;
+ WARN_ON_ONCE(timer->expires.tv64 < 0);
+
/*
* When the callback is running, we do not reprogram the clock event
* device. The timer callback is either running on a different CPU or
@@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer,
if (hrtimer_callback_running(timer))
return 0;
+ /*
+ * CLOCK_REALTIME timer might be requested with an absolute
+ * expiry time which is less than base->offset. Nothing wrong
+ * about that, just avoid calling into the tick code, which
+ * now objects to negative expiry values.
+ */
+ if (expires.tv64 < 0)
+ return -ETIME;
+
if (expires.tv64 >= expires_next->tv64)
return 0;
@@ -682,13 +710,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
*/
orun++;
}
- timer->expires = ktime_add(timer->expires, interval);
- /*
- * Make sure, that the result did not wrap with a very large
- * interval.
- */
- if (timer->expires.tv64 < 0)
- timer->expires = ktime_set(KTIME_SEC_MAX, 0);
+ timer->expires = ktime_add_safe(timer->expires, interval);
return orun;
}
@@ -839,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
new_base = switch_hrtimer_base(timer, base);
if (mode == HRTIMER_MODE_REL) {
- tim = ktime_add(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, new_base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -848,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
* timeouts. This will go away with the GTOD framework.
*/
#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add(tim, base->resolution);
+ tim = ktime_add_safe(tim, base->resolution);
#endif
- /*
- * Careful here: User space might have asked for a
- * very long sleep, so the add above might result in a
- * negative number, which enqueues the timer in front
- * of the queue.
- */
- if (tim.tv64 < 0)
- tim.tv64 = KTIME_MAX;
}
timer->expires = tim;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index cc54c6276356..fdb3fbe2b0c4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -246,6 +246,17 @@ static unsigned int default_startup(unsigned int irq)
}
/*
+ * default shutdown function
+ */
+static void default_shutdown(unsigned int irq)
+{
+	struct irq_desc *desc = &irq_desc[irq];
+
+	/* mask the line through the chip, then record it in the status */
+	desc->chip->mask(irq);
+	desc->status |= IRQ_MASKED;
+}
+
+/*
* Fixup enable/disable function pointers
*/
void irq_chip_set_defaults(struct irq_chip *chip)
@@ -256,8 +267,15 @@ void irq_chip_set_defaults(struct irq_chip *chip)
chip->disable = default_disable;
if (!chip->startup)
chip->startup = default_startup;
+ /*
+ * We use chip->disable, when the user provided its own. When
+ * we have default_disable set for chip->disable, then we need
+ * to use default_shutdown, otherwise the irq line is not
+ * disabled on free_irq():
+ */
if (!chip->shutdown)
- chip->shutdown = chip->disable;
+ chip->shutdown = chip->disable != default_disable ?
+ chip->disable : default_shutdown;
if (!chip->name)
chip->name = chip->typename;
if (!chip->end)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index a6b2bc831dd0..088dabbf2d6a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -6,6 +6,7 @@
* This file contains spurious interrupt handling.
*/
+#include <linux/jiffies.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
@@ -179,7 +180,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
* otherwise the couter becomes a doomsday timer for otherwise
* working systems
*/
- if (jiffies - desc->last_unhandled > HZ/10)
+ if (time_after(jiffies, desc->last_unhandled + HZ/10))
desc->irqs_unhandled = 1;
else
desc->irqs_unhandled++;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
new file mode 100644
index 000000000000..1bd0ec1c80b2
--- /dev/null
+++ b/kernel/kgdb.c
@@ -0,0 +1,1700 @@
+/*
+ * KGDB stub.
+ *
+ * Maintainer: Jason Wessel <jason.wessel@windriver.com>
+ *
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002-2004 Timesys Corporation
+ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
+ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2005-2008 Wind River Systems, Inc.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Contributors at various stages not listed above:
+ * Jason Wessel ( jason.wessel@windriver.com )
+ * George Anzinger <george@mvista.com>
+ * Anurekh Saxena (anurekh.saxena@timesys.com)
+ * Lake Stevens Instrument Division (Glenn Engel)
+ * Jim Kingdon, Cygnus Support.
+ *
+ * Original KGDB stub: David Grothe <dave@gcom.com>,
+ * Tigran Aivazian <tigran@sco.com>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2. This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+#include <linux/pid_namespace.h>
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/console.h>
+#include <linux/threads.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/sysrq.h>
+#include <linux/init.h>
+#include <linux/kgdb.h>
+#include <linux/pid.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/byteorder.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+static int kgdb_break_asap;
+
+struct kgdb_state {
+ int ex_vector;
+ int signo;
+ int err_code;
+ int cpu;
+ int pass_exception;
+ long threadid;
+ long kgdb_usethreadid;
+ struct pt_regs *linux_regs;
+};
+
+static struct debuggerinfo_struct {
+ void *debuggerinfo;
+ struct task_struct *task;
+} kgdb_info[NR_CPUS];
+
+/**
+ * kgdb_connected - Is a host GDB connected to us?
+ */
+int kgdb_connected;
+EXPORT_SYMBOL_GPL(kgdb_connected);
+
+/* All the KGDB handlers are installed */
+static int kgdb_io_module_registered;
+
+/* Guard for recursive entry */
+static int exception_level;
+
+static struct kgdb_io *kgdb_io_ops;
+static DEFINE_SPINLOCK(kgdb_registration_lock);
+
+/* kgdb console driver is loaded */
+static int kgdb_con_registered;
+/* determine if kgdb console output should be used */
+static int kgdb_use_con;
+
+/*
+ * "kgdbcon" early boot parameter: sets kgdb_use_con; @str is not used.
+ * The same flag is also registered as a module parameter (mode 0644).
+ */
+static int __init opt_kgdb_con(char *str)
+{
+	kgdb_use_con = 1;
+
+	return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
+module_param(kgdb_use_con, int, 0644);
+
+/*
+ * Holds information about breakpoints in a kernel. These breakpoints are
+ * added and removed by gdb.
+ */
+static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
+ [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
+};
+
+/*
+ * The CPU# of the active CPU, or -1 if none:
+ */
+atomic_t kgdb_active = ATOMIC_INIT(-1);
+
+/*
+ * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
+ * bootup code (which might not have percpu set up yet):
+ */
+static atomic_t passive_cpu_wait[NR_CPUS];
+static atomic_t cpu_in_kgdb[NR_CPUS];
+atomic_t kgdb_setting_breakpoint;
+
+struct task_struct *kgdb_usethread;
+struct task_struct *kgdb_contthread;
+
+int kgdb_single_step;
+
+/* Our I/O buffers. */
+static char remcom_in_buffer[BUFMAX];
+static char remcom_out_buffer[BUFMAX];
+
+/* Storage for the registers, in GDB format. */
+static unsigned long gdb_regs[(NUMREGBYTES +
+ sizeof(unsigned long) - 1) /
+ sizeof(unsigned long)];
+
+/* Keeps track of the CPU which is doing the single stepping */
+atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
+
+/*
+ * If you are debugging a problem where roundup (the collection of
+ * all other CPUs) is a problem [this should be extremely rare],
+ * then use the nokgdbroundup option to avoid roundup. In that case
+ * the other CPUs might interfere with your debugging context, so
+ * use this with care:
+ */
+int kgdb_do_roundup = 1;
+
+/* "nokgdbroundup" early boot parameter: clears kgdb_do_roundup; @str is not used. */
+static int __init opt_nokgdbroundup(char *str)
+{
+	kgdb_do_roundup = 0;
+	return 0;
+}
+
+early_param("nokgdbroundup", opt_nokgdbroundup);
+
+/*
+ * Finally, some KGDB code :-)
+ */
+
+/*
+ * Weak aliases for breakpoint management,
+ * can be overridden by architectures when needed:
+ */
+/*
+ * Default address check: try to read BREAK_INSTR_SIZE bytes at @addr
+ * into a scratch buffer and return what probe_kernel_read() reports.
+ */
+int __weak kgdb_validate_break_address(unsigned long addr)
+{
+	char scratch[BREAK_INSTR_SIZE];
+
+	return probe_kernel_read(scratch, (char *)addr, BREAK_INSTR_SIZE);
+}
+
+/*
+ * Default breakpoint insertion: save BREAK_INSTR_SIZE bytes found at
+ * @addr into @saved_instr, then overwrite them with the architecture's
+ * breakpoint instruction.  Returns the error of whichever probe_kernel_*
+ * access failed first, otherwise the result of the write.
+ */
+int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
+{
+	char *where = (char *)addr;
+	int rc = probe_kernel_read(saved_instr, where, BREAK_INSTR_SIZE);
+
+	if (rc != 0)
+		return rc;
+
+	return probe_kernel_write(where, arch_kgdb_ops.gdb_bpt_instr,
+				  BREAK_INSTR_SIZE);
+}
+
+/*
+ * Default breakpoint removal: write the BREAK_INSTR_SIZE bytes held in
+ * @bundle back to @addr and return the result of that write.
+ */
+int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
+{
+	char *where = (char *)addr;
+
+	return probe_kernel_write(where, bundle, BREAK_INSTR_SIZE);
+}
+
+/* Default: report instruction_pointer(@regs); @exception is not consulted. */
+unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+	return instruction_pointer(regs);
+}
+
+/* Default architecture init hook: does nothing and returns 0. */
+int __weak kgdb_arch_init(void)
+{
+	return 0;
+}
+
+/* Default: returns 0 for every exception; neither argument is examined. */
+int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+	return 0;
+}
+
+/* Default hook: intentionally empty; architectures may override it. */
+void __weak
+kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
+{
+}
+
+/**
+ * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
+ * @regs: Current &struct pt_regs.
+ *
+ * This function will be called if the particular architecture must
+ * disable hardware debugging while it is processing gdb packets or
+ * handling exception.
+ *
+ * This weak default does nothing.
+ */
+void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
+{
+}
+
+/*
+ * GDB remote protocol parser:
+ */
+
+static const char hexchars[] = "0123456789abcdef";
+
+/*
+ * Map one ASCII hex digit (upper or lower case) to its value 0..15.
+ * Returns -1 when @ch is not a hex digit.
+ */
+static int hex(char ch)
+{
+	if (ch >= '0' && ch <= '9')
+		return ch - '0';
+	if (ch >= 'a' && ch <= 'f')
+		return 10 + (ch - 'a');
+	if (ch >= 'A' && ch <= 'F')
+		return 10 + (ch - 'A');
+
+	return -1;
+}
+
+/*
+ * Scan for the sequence $<data>#<checksum> and store <data>,
+ * NUL-terminated, in @buffer (which must hold BUFMAX bytes).
+ *
+ * Every packet that reaches its '#' trailer is answered with '+' when
+ * the two transmitted checksum digits match the sum of the data bytes,
+ * and with '-' otherwise.  The function only returns once a packet has
+ * been acknowledged with '+'.
+ *
+ * A packet that fills the buffer before its '#' arrives is never
+ * accepted: it is dropped silently and scanning restarts at the next
+ * '$'.  Checksum digits that are not valid hex are treated as a
+ * mismatch.
+ */
+static void get_packet(char *buffer)
+{
+	unsigned char checksum;
+	int valid = 0;
+	int count;
+	int hi;
+	int lo;
+	char ch;
+
+	do {
+		/*
+		 * Spin and wait around for the start character, ignore all
+		 * other characters:
+		 */
+		while ((ch = (kgdb_io_ops->read_char())) != '$')
+			/* nothing */;
+
+		kgdb_connected = 1;
+		checksum = 0;
+		count = 0;
+
+		/*
+		 * now, read until a # or end of buffer is found:
+		 */
+		while (count < (BUFMAX - 1)) {
+			ch = kgdb_io_ops->read_char();
+			if (ch == '#')
+				break;
+			checksum = checksum + ch;
+			buffer[count] = ch;
+			count = count + 1;
+		}
+		buffer[count] = 0;
+
+		/*
+		 * Buffer exhausted without a trailer: there is no checksum
+		 * to verify, so never accept this data.  'valid' is still
+		 * 0 here, hence the loop goes back to waiting for '$'.
+		 */
+		if (ch != '#')
+			continue;
+
+		hi = hex(kgdb_io_ops->read_char());
+		lo = hex(kgdb_io_ops->read_char());
+
+		/* reject non-hex digits instead of shifting a negative value */
+		valid = hi >= 0 && lo >= 0 && checksum == ((hi << 4) | lo);
+
+		/* '+' acknowledges the transfer, '-' reports a bad checksum */
+		kgdb_io_ops->write_char(valid ? '+' : '-');
+		if (kgdb_io_ops->flush)
+			kgdb_io_ops->flush();
+	} while (!valid);
+}
+
+/*
+ * Send the NUL-terminated packet in buffer, framed as
+ * $<packet info>#<checksum>, where the checksum is the 8-bit sum of
+ * the payload bytes written as two hex digits.
+ *
+ * The packet is retransmitted until the peer answers with '+'.  The
+ * function also gives up (after sending '-') when the reply is '$';
+ * any other reply byte causes a retransmission.
+ */
+static void put_packet(char *buffer)
+{
+ unsigned char checksum;
+ int count;
+ char ch;
+
+ /*
+ * $<packet info>#<checksum>.
+ */
+ while (1) {
+ kgdb_io_ops->write_char('$');
+ checksum = 0;
+ count = 0;
+
+ while ((ch = buffer[count])) {
+ kgdb_io_ops->write_char(ch);
+ checksum += ch;
+ count++;
+ }
+
+ kgdb_io_ops->write_char('#');
+ kgdb_io_ops->write_char(hexchars[checksum >> 4]);
+ kgdb_io_ops->write_char(hexchars[checksum & 0xf]);
+ if (kgdb_io_ops->flush)
+ kgdb_io_ops->flush();
+
+ /*
+ * Now see what we get in reply. A reply byte of value 3 is
+ * dropped and the byte after it is used instead (presumably 3
+ * is GDB's Ctrl-C interrupt byte - TODO confirm).
+ */
+ ch = kgdb_io_ops->read_char();
+
+ if (ch == 3)
+ ch = kgdb_io_ops->read_char();
+
+ /* If we get an ACK, we are done. */
+ if (ch == '+')
+ return;
+
+ /*
+ * If we get the start of another packet, this means
+ * that GDB is attempting to reconnect. We will NAK
+ * the packet being sent, and stop trying to send this
+ * packet.
+ */
+ if (ch == '$') {
+ kgdb_io_ops->write_char('-');
+ if (kgdb_io_ops->flush)
+ kgdb_io_ops->flush();
+ return;
+ }
+ }
+}
+
+/*
+ * Write @byte as two lower-case hex digits (high nibble first) at @pkt
+ * and return the position just past them.
+ */
+static char *pack_hex_byte(char *pkt, u8 byte)
+{
+	pkt[0] = hexchars[byte >> 4];
+	pkt[1] = hexchars[byte & 0xf];
+
+	return pkt + 2;
+}
+
+/*
+ * Convert count bytes of the memory pointed to by mem into hex, placing
+ * the NUL-terminated result in buf.
+ *
+ * Returns the result of probe_kernel_read(): 0 on success, otherwise the
+ * error it reported, in which case no hex output is produced.
+ *
+ * buf must provide room for 2 * count + 1 bytes: the raw copy is staged
+ * at buf + count, and 2 * count hex digits plus the terminator are written.
+ */
+int kgdb_mem2hex(char *mem, char *buf, int count)
+{
+ char *tmp;
+ int err;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory copy. Hex conversion will work against this one.
+ * Each raw byte is read before the two digits for it are stored,
+ * so the output never overwrites raw data that is still pending.
+ */
+ tmp = buf + count;
+
+ err = probe_kernel_read(tmp, mem, count);
+ if (!err) {
+ while (count > 0) {
+ buf = pack_hex_byte(buf, *tmp);
+ tmp++;
+ count--;
+ }
+
+ *buf = 0;
+ }
+
+ return err;
+}
+
+/*
+ * Copy the binary array pointed to by buf into mem, producing count
+ * output bytes.  An input byte of 0x7d is an escape marker: it is
+ * dropped and the byte that follows it is XORed with 0x20.
+ *
+ * Bytes are written one at a time with probe_kernel_write().  Returns 0
+ * when all of them were written, otherwise the error of the first failed
+ * write (bytes before it stay written).
+ */
+static int kgdb_ebin2mem(char *buf, char *mem, int count)
+{
+	int rc;
+
+	for (; count > 0; count--) {
+		char byte = *buf++;
+
+		if (byte == 0x7d)
+			byte = *buf++ ^ 0x20;
+
+		rc = probe_kernel_write(mem, &byte, 1);
+		if (rc)
+			return rc;
+
+		mem++;
+	}
+
+	return 0;
+}
+
+/*
+ * Convert the hex array pointed to by buf (2 * count hex digits) into
+ * count binary bytes and place them in mem.
+ *
+ * Returns the result of probe_kernel_write() for the converted bytes.
+ * The contents of buf are modified: the conversion is done in place.
+ */
+int kgdb_hex2mem(char *buf, char *mem, int count)
+{
+ char *tmp_raw;
+ char *tmp_hex;
+
+ /*
+ * We use the upper half of buf as an intermediate buffer for the
+ * raw memory that is converted from hex. The digits are consumed
+ * from the last pair backwards, so each raw byte lands at or above
+ * the digits still to be read; when the loop ends tmp_raw points
+ * at buf + count.
+ */
+ tmp_raw = buf + count * 2;
+
+ tmp_hex = tmp_raw - 1;
+ while (tmp_hex >= buf) {
+ tmp_raw--;
+ *tmp_raw = hex(*tmp_hex--);
+ *tmp_raw |= hex(*tmp_hex--) << 4;
+ }
+
+ return probe_kernel_write(mem, tmp_raw, count);
+}
+
+/*
+ * Parse leading hex digits at *ptr into *long_val (which is reset to 0
+ * first).  *ptr is advanced past every digit consumed and is left on the
+ * first non-hex character or on the terminating NUL.
+ * Returns the number of digits consumed.
+ */
+int kgdb_hex2long(char **ptr, long *long_val)
+{
+	int digits;
+
+	*long_val = 0;
+
+	for (digits = 0; **ptr != '\0'; digits++, (*ptr)++) {
+		int nibble = hex(**ptr);
+
+		if (nibble < 0)
+			break;
+
+		*long_val = (*long_val << 4) | nibble;
+	}
+
+	return digits;
+}
+
+/*
+ * Write memory due to an 'M' or 'X' packet.
+ *
+ * The request in remcom_in_buffer has the form <cmd>addr,length:data.
+ * @binary selects escaped binary data ('X', non-zero) or hex data ('M',
+ * zero).  Returns 0 on success, -EINVAL if the header cannot be parsed,
+ * or the error reported by the copy helper.
+ */
+static int write_mem_msg(int binary)
+{
+	char *ptr = &remcom_in_buffer[1];
+	unsigned long addr;
+	unsigned long length;
+	int err;
+
+	/*
+	 * kgdb_hex2long() stores through a long *; address and length are
+	 * kept unsigned here, so make the pointer conversion explicit.
+	 */
+	if (kgdb_hex2long(&ptr, (long *)&addr) <= 0 || *(ptr++) != ',' ||
+	    kgdb_hex2long(&ptr, (long *)&length) <= 0 || *(ptr++) != ':')
+		return -EINVAL;
+
+	if (binary)
+		err = kgdb_ebin2mem(ptr, (char *)addr, length);
+	else
+		err = kgdb_hex2mem(ptr, (char *)addr, length);
+	if (err)
+		return err;
+
+	/* flush exactly the bytes written: [addr, addr + length) */
+	if (CACHE_FLUSH_IS_SAFE)
+		flush_icache_range(addr, addr + length);
+
+	return 0;
+}
+
+/*
+ * Build an "Enn" error reply in pkt from a negative error value: the two
+ * characters are hexchars[] entries selected by the tens and units of
+ * the negated error.
+ */
+static void error_packet(char *pkt, int error)
+{
+	int code = -error;
+
+	pkt[0] = 'E';
+	pkt[1] = hexchars[code / 10];
+	pkt[2] = hexchars[code % 10];
+	pkt[3] = '\0';
+}
+
+/*
+ * Thread ID accessors. We represent a flat TID space to GDB, where
+ * the per CPU idle threads (which under Linux all have PID 0) are
+ * remapped to negative TIDs.
+ */
+
+#define BUF_THREAD_ID_SIZE 16
+
+/*
+ * Hex-encode the thread reference at id into pkt until
+ * BUF_THREAD_ID_SIZE output characters have been produced. Returns the
+ * position in pkt just past the encoded id.
+ */
+static char *pack_threadid(char *pkt, unsigned char *id)
+{
+	char *end = pkt + BUF_THREAD_ID_SIZE;
+
+	for (; pkt < end; id++)
+		pkt = pack_hex_byte(pkt, *id);
+
+	return pkt;
+}
+
+/*
+ * Fill the 8-byte thread reference at id from value: the first four
+ * bytes are zero, the last four hold value with the most significant
+ * byte first.
+ */
+static void int_to_threadref(unsigned char *id, int value)
+{
+	int i;
+
+	for (i = 0; i < 4; i++)
+		id[i] = 0;
+
+	id[4] = (value >> 24) & 0xff;
+	id[5] = (value >> 16) & 0xff;
+	id[6] = (value >> 8) & 0xff;
+	id[7] = value & 0xff;
+}
+
+/*
+ * Translate a debugger thread id into a task.
+ *
+ * @regs: unused here.
+ * @tid:  thread id received from the debugger.
+ *
+ * Returns the task, or NULL if no task matches or a non-positive @tid
+ * does not name a valid CPU.
+ */
+static struct task_struct *getthread(struct pt_regs *regs, int tid)
+{
+	/*
+	 * Non-positive TIDs are remapped idle tasks: -tid is handed to
+	 * idle_task() as a CPU number. The value originates from the
+	 * remote debugger, so refuse anything that is not below NR_CPUS
+	 * rather than looking up a CPU that cannot exist.
+	 *
+	 * NOTE(review): shadow_pid() reports the idle task of CPU n as
+	 * -1-n, which does not match the -tid mapping used here - confirm
+	 * which convention is intended.
+	 */
+	if (tid <= 0) {
+		if (tid <= -NR_CPUS)
+			return NULL;
+		return idle_task(-tid);
+	}
+
+	/*
+	 * find_task_by_pid_ns() does not take the tasklist lock anymore
+	 * but is nicely RCU locked - hence is a pretty resilient
+	 * thing to use:
+	 */
+	return find_task_by_pid_ns(tid, &init_pid_ns);
+}
+
+/*
+ * CPU debug state control:
+ */
+
+#ifdef CONFIG_SMP
+/*
+ * Park the calling CPU while another CPU runs the debugger.
+ *
+ * Publishes this CPU's registers and current task in kgdb_info[], marks
+ * the CPU in cpu_in_kgdb[], then spins with local interrupts disabled
+ * until passive_cpu_wait[cpu] reads zero. On the way out the published
+ * info is cleared and the cpu_in_kgdb[] flag is dropped again.
+ */
+static void kgdb_wait(struct pt_regs *regs)
+{
+ unsigned long flags;
+ int cpu;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ kgdb_info[cpu].debuggerinfo = regs;
+ kgdb_info[cpu].task = current;
+ /*
+ * Make sure the above info reaches the primary CPU before
+ * our cpu_in_kgdb[] flag setting does:
+ */
+ smp_wmb();
+ atomic_set(&cpu_in_kgdb[cpu], 1);
+
+ /* Wait till primary CPU is done with debugging */
+ while (atomic_read(&passive_cpu_wait[cpu]))
+ cpu_relax();
+
+ /* The saved register/task info is only valid while we are parked. */
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
+
+ /* fix up hardware debug registers on local cpu */
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+
+ /* Signal the primary CPU that we are done: */
+ atomic_set(&cpu_in_kgdb[cpu], 0);
+ clocksource_touch_watchdog();
+ local_irq_restore(flags);
+}
+#endif
+
+/*
+ * Flush the caches covering a software breakpoint at addr, for
+ * architectures that need it when a breakpoint is set or cleared.
+ * Does nothing unless CACHE_FLUSH_IS_SAFE.
+ */
+static void kgdb_flush_swbreak_addr(unsigned long addr)
+{
+	if (CACHE_FLUSH_IS_SAFE) {
+		if (current->mm && current->mm->mmap_cache) {
+			flush_cache_range(current->mm->mmap_cache,
+					  addr, addr + BREAK_INSTR_SIZE);
+		}
+
+		/* Force flush instruction cache if it was outside the mm */
+		flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
+	}
+}
+
+/*
+ * SW breakpoint management:
+ */
+/*
+ * Install every breakpoint currently in state BP_SET and move it to
+ * BP_ACTIVE. Stops at, and returns, the first error reported by
+ * kgdb_arch_set_breakpoint(); returns 0 if all were installed.
+ */
+static int kgdb_activate_sw_breakpoints(void)
+{
+	int i;
+
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		unsigned long addr;
+		int error;
+
+		if (kgdb_break[i].state == BP_SET) {
+			addr = kgdb_break[i].bpt_addr;
+			error = kgdb_arch_set_breakpoint(addr,
+					kgdb_break[i].saved_instr);
+			if (error)
+				return error;
+
+			kgdb_flush_swbreak_addr(addr);
+			kgdb_break[i].state = BP_ACTIVE;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Record a software breakpoint at addr in kgdb_break[] (state BP_SET).
+ *
+ * Returns the error from kgdb_validate_break_address() if the address is
+ * rejected, -EEXIST if a BP_SET entry already exists for addr, -E2BIG if
+ * no slot is available, 0 otherwise. A BP_REMOVED slot for the same
+ * address is preferred over the first BP_UNDEFINED slot.
+ */
+static int kgdb_set_sw_break(unsigned long addr)
+{
+	int removed_slot = -1;
+	int free_slot = -1;
+	int breakno;
+	int err;
+	int i;
+
+	err = kgdb_validate_break_address(addr);
+	if (err)
+		return err;
+
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		int same_addr = (kgdb_break[i].bpt_addr == addr);
+
+		if (kgdb_break[i].state == BP_SET && same_addr)
+			return -EEXIST;
+
+		if (removed_slot == -1 &&
+		    kgdb_break[i].state == BP_REMOVED && same_addr)
+			removed_slot = i;
+
+		if (free_slot == -1 && kgdb_break[i].state == BP_UNDEFINED)
+			free_slot = i;
+	}
+
+	breakno = (removed_slot != -1) ? removed_slot : free_slot;
+	if (breakno == -1)
+		return -E2BIG;
+
+	kgdb_break[breakno].state = BP_SET;
+	kgdb_break[breakno].type = BP_BREAKPOINT;
+	kgdb_break[breakno].bpt_addr = addr;
+
+	return 0;
+}
+
+/*
+ * Remove every breakpoint currently in state BP_ACTIVE and move it back
+ * to BP_SET. Stops at, and returns, the first error reported by
+ * kgdb_arch_remove_breakpoint(); returns 0 if all were removed.
+ */
+static int kgdb_deactivate_sw_breakpoints(void)
+{
+	int i;
+
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		unsigned long addr;
+		int error;
+
+		if (kgdb_break[i].state == BP_ACTIVE) {
+			addr = kgdb_break[i].bpt_addr;
+			error = kgdb_arch_remove_breakpoint(addr,
+					kgdb_break[i].saved_instr);
+			if (error)
+				return error;
+
+			kgdb_flush_swbreak_addr(addr);
+			kgdb_break[i].state = BP_SET;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Mark the first BP_SET breakpoint at addr as BP_REMOVED.
+ * Returns 0 if one was found, -ENOENT otherwise.
+ */
+static int kgdb_remove_sw_break(unsigned long addr)
+{
+	int i;
+
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		if (kgdb_break[i].state != BP_SET ||
+		    kgdb_break[i].bpt_addr != addr)
+			continue;
+
+		kgdb_break[i].state = BP_REMOVED;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * Return 1 if kgdb_break[] holds a BP_REMOVED entry for addr,
+ * 0 otherwise.
+ */
+int kgdb_isremovedbreak(unsigned long addr)
+{
+	int i;
+
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		if (kgdb_break[i].state != BP_REMOVED)
+			continue;
+		if (kgdb_break[i].bpt_addr == addr)
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Drop every breakpoint: active software breakpoints are removed from
+ * memory (a failure is only logged), every kgdb_break[] slot is reset to
+ * BP_UNDEFINED, and the arch hook for hardware breakpoints is called if
+ * present. Always returns 0.
+ */
+int remove_all_break(void)
+{
+	unsigned long addr;
+	int error;
+	int i;
+
+	/* Clear memory breakpoints. */
+	for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
+		if (kgdb_break[i].state == BP_ACTIVE) {
+			addr = kgdb_break[i].bpt_addr;
+			error = kgdb_arch_remove_breakpoint(addr,
+					kgdb_break[i].saved_instr);
+			if (error)
+				printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
+				       addr);
+		}
+
+		/* Every slot ends up undefined, whatever its prior state. */
+		kgdb_break[i].state = BP_UNDEFINED;
+	}
+
+	/* Clear hardware breakpoints. */
+	if (arch_kgdb_ops.remove_all_hw_break)
+		arch_kgdb_ops.remove_all_hw_break();
+
+	return 0;
+}
+
+/*
+ * Map a real PID to the thread id shown to the debugger: non-zero PIDs
+ * are passed through, PID 0 (idle) becomes -1 minus the current CPU
+ * number.
+ */
+static inline int shadow_pid(int realpid)
+{
+	if (!realpid)
+		return -1-raw_smp_processor_id();
+
+	return realpid;
+}
+
+/* Scratch buffer for 'O' (console output) packets. */
+static char gdbmsgbuf[BUFMAX + 1];
+
+/*
+ * Send len bytes of s to the debugger as one or more 'O' packets, each
+ * carrying the hex encoding of at most (BUFMAX - 2) / 2 bytes.
+ */
+static void kgdb_msg_write(const char *s, int len)
+{
+	char *out;
+	int chunk;
+	int i;
+
+	/* 'O'utput */
+	gdbmsgbuf[0] = 'O';
+
+	for (; len > 0; s += chunk, len -= chunk) {
+		/* Calculate how many this time */
+		chunk = ((len << 1) > (BUFMAX - 2)) ? ((BUFMAX - 2) >> 1) : len;
+
+		/* Pack in hex chars */
+		out = gdbmsgbuf + 1;
+		for (i = 0; i < chunk; i++)
+			out = pack_hex_byte(out, s[i]);
+		*out = '\0';
+
+		/* Write packet */
+		put_packet(gdbmsgbuf);
+	}
+}
+
+/*
+ * Return 1 if a kgdb I/O module is registered, 0 if not.
+ *
+ * When print_wait is set, no debugger is connected and no
+ * kgdb_breakpoint() is in progress, a "waiting for remote debugger"
+ * message is printed first. print_wait is only meant to be true when
+ * called from the core kgdb_handle_exception, because that path will
+ * wait for the debugger to attach.
+ */
+static int kgdb_io_ready(int print_wait)
+{
+	if (!kgdb_io_ops)
+		return 0;
+
+	if (!kgdb_connected &&
+	    !atomic_read(&kgdb_setting_breakpoint) &&
+	    print_wait)
+		printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
+
+	return 1;
+}
+
+/*
+ * All the functions that start with gdb_cmd are the various
+ * operations to implement the handlers for the gdbserial protocol
+ * where KGDB is communicating with an external debugger
+ */
+
+/*
+ * Handle the '?' status packet: reply "S" followed by the hex-encoded
+ * signal number.
+ */
+static void gdb_cmd_status(struct kgdb_state *ks)
+{
+	char *reply = remcom_out_buffer;
+
+	/*
+	 * This packet is only sent during initial connect, so clear out
+	 * our breakpoints now in case GDB is reconnecting.
+	 */
+	remove_all_break();
+
+	*reply++ = 'S';
+	pack_hex_byte(reply, ks->signo);
+}
+
+/*
+ * Handle the 'g' get registers request: hex-encode the register set of
+ * the selected thread (kgdb_usethread, or this CPU's task if none is
+ * selected) into remcom_out_buffer.
+ */
+static void gdb_cmd_getregs(struct kgdb_state *ks)
+{
+	struct task_struct *thread = kgdb_usethread;
+	void *saved_regs = NULL;
+	int cpu;
+
+	if (thread) {
+		/*
+		 * Look for the task on any CPU (possibly this one). If it
+		 * is not found, saved_regs stays NULL and the result is
+		 * approximated below.
+		 */
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			if (kgdb_info[cpu].task == thread)
+				saved_regs = kgdb_info[cpu].debuggerinfo;
+		}
+	} else {
+		thread = kgdb_info[ks->cpu].task;
+		saved_regs = kgdb_info[ks->cpu].debuggerinfo;
+	}
+
+	/*
+	 * All threads that don't have debuggerinfo should be in
+	 * __schedule() sleeping, since all other CPUs are in kgdb_wait,
+	 * and thus have debuggerinfo. For those, pull what was saved
+	 * during switch_to; that should be enough for a stack trace.
+	 */
+	if (saved_regs)
+		pt_regs_to_gdb_regs(gdb_regs, saved_regs);
+	else
+		sleeping_thread_to_gdb_regs(gdb_regs, thread);
+
+	kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
+}
+
+/*
+ * Handle the 'G' set registers request: decode the hex register image
+ * into gdb_regs and, if the selected thread is the current one (or none
+ * is selected), apply it to the exception registers and reply "OK".
+ * Otherwise reply with an EINVAL error packet.
+ */
+static void gdb_cmd_setregs(struct kgdb_state *ks)
+{
+	kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
+
+	if (!kgdb_usethread || kgdb_usethread == current) {
+		gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
+		strcpy(remcom_out_buffer, "OK");
+		return;
+	}
+
+	error_packet(remcom_out_buffer, -EINVAL);
+}
+
+/*
+ * Handle the 'm' memory read request ("m<addr>,<length>").
+ *
+ * On success remcom_out_buffer holds the hex encoding of the requested
+ * bytes. A malformed request, or one whose reply would not fit in
+ * remcom_out_buffer, gets an EINVAL error packet; a failed read gets an
+ * error packet carrying the error from kgdb_mem2hex().
+ */
+static void gdb_cmd_memread(struct kgdb_state *ks)
+{
+	char *ptr = &remcom_in_buffer[1];
+	unsigned long length;
+	unsigned long addr;
+	int err;
+
+	if (kgdb_hex2long(&ptr, &addr) <= 0 || *ptr++ != ',' ||
+	    kgdb_hex2long(&ptr, &length) <= 0) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+
+	/*
+	 * kgdb_mem2hex() produces two hex characters per byte plus a
+	 * terminating NUL in the output buffer. The length comes from the
+	 * remote side, so refuse requests that would run past the buffer.
+	 */
+	if (length > (sizeof(remcom_out_buffer) - 1) / 2) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+
+	err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
+	if (err)
+		error_packet(remcom_out_buffer, err);
+}
+
+/*
+ * Handle the 'M' (hex-encoded) memory write request: reply "OK" on
+ * success, otherwise an error packet with the write_mem_msg() error.
+ */
+static void gdb_cmd_memwrite(struct kgdb_state *ks)
+{
+	int err = write_mem_msg(0);
+
+	if (!err) {
+		strcpy(remcom_out_buffer, "OK");
+		return;
+	}
+
+	error_packet(remcom_out_buffer, err);
+}
+
+/*
+ * Handle the 'X' (escaped binary) memory write request: reply "OK" on
+ * success, otherwise an error packet with the write_mem_msg() error.
+ */
+static void gdb_cmd_binwrite(struct kgdb_state *ks)
+{
+	int err = write_mem_msg(1);
+
+	if (!err) {
+		strcpy(remcom_out_buffer, "OK");
+		return;
+	}
+
+	error_packet(remcom_out_buffer, err);
+}
+
+/*
+ * Handle the 'D' (detach) and 'k' (kill) packets. Both remove all
+ * breakpoints and mark the debugger as disconnected; only detach sends
+ * a reply.
+ */
+static void gdb_cmd_detachkill(struct kgdb_state *ks)
+{
+	int error;
+
+	if (remcom_in_buffer[0] != 'D') {
+		/*
+		 * Assume the kill case, with no exit code checking,
+		 * trying to force detach the debugger:
+		 */
+		remove_all_break();
+		kgdb_connected = 0;
+		return;
+	}
+
+	/* The detach case */
+	error = remove_all_break();
+	if (error < 0) {
+		error_packet(remcom_out_buffer, error);
+	} else {
+		strcpy(remcom_out_buffer, "OK");
+		kgdb_connected = 0;
+	}
+	put_packet(remcom_out_buffer);
+}
+
+/*
+ * Handle the 'R' reboot packet. Only "R0" is honored: it is
+ * acknowledged with "OK" and the machine is restarted. Returns 1 if the
+ * restart call came back, 0 if the packet was not "R0".
+ */
+static int gdb_cmd_reboot(struct kgdb_state *ks)
+{
+	/* For now, only honor R0 */
+	if (strcmp(remcom_in_buffer, "R0") != 0)
+		return 0;
+
+	printk(KERN_CRIT "Executing emergency reboot\n");
+	strcpy(remcom_out_buffer, "OK");
+	put_packet(remcom_out_buffer);
+
+	/* Execution should not return from machine_emergency_restart() */
+	machine_emergency_restart();
+	kgdb_connected = 0;
+
+	return 1;
+}
+
+/*
+ * Handle the 'q' query packets.
+ *
+ * Dispatches on the character after 'q':
+ *  - 'f'/'s' (qfThreadInfo/qsThreadInfo): reply 'm' followed by a
+ *    comma-separated list of hex thread ids;
+ *  - 'C': reply "QC" followed by the current thread id;
+ *  - 'T' (qThreadExtraInfo,<id>): reply with a hex-encoded description.
+ * Any other query leaves remcom_out_buffer untouched.
+ */
+static void gdb_cmd_query(struct kgdb_state *ks)
+{
+ struct task_struct *thread;
+ unsigned char thref[8];
+ char *ptr;
+ int i;
+
+ switch (remcom_in_buffer[1]) {
+ case 's':
+ case 'f':
+ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+
+ /* 'f' restarts the scan; 's' continues from ks->threadid. */
+ if (remcom_in_buffer[1] == 'f')
+ ks->threadid = 1;
+
+ remcom_out_buffer[0] = 'm';
+ ptr = remcom_out_buffer + 1;
+
+ /*
+ * Collect the next 17 existing thread ids.
+ * NOTE(review): the loop only ends once 17 tasks have been
+ * found; there is no visible upper bound on ks->threadid if
+ * fewer remain - confirm this cannot spin.
+ */
+ for (i = 0; i < 17; ks->threadid++) {
+ thread = getthread(ks->linux_regs, ks->threadid);
+ if (thread) {
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(ptr, thref);
+ ptr += BUF_THREAD_ID_SIZE;
+ *(ptr++) = ',';
+ i++;
+ }
+ }
+ /* Overwrite the trailing ',' with the terminator. */
+ *(--ptr) = '\0';
+ break;
+
+ case 'C':
+ /* Current thread id */
+ strcpy(remcom_out_buffer, "QC");
+ ks->threadid = shadow_pid(current->pid);
+ int_to_threadref(thref, ks->threadid);
+ pack_threadid(remcom_out_buffer + 2, thref);
+ break;
+ case 'T':
+ if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ ks->threadid = 0;
+ /* Skip 'q' plus the 16-character "ThreadExtraInfo," prefix. */
+ ptr = remcom_in_buffer + 17;
+ kgdb_hex2long(&ptr, &ks->threadid);
+ if (!getthread(ks->linux_regs, ks->threadid)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ if (ks->threadid > 0) {
+ /* Real task: report 16 bytes of its comm field. */
+ kgdb_mem2hex(getthread(ks->linux_regs,
+ ks->threadid)->comm,
+ remcom_out_buffer, 16);
+ } else {
+ static char tmpstr[23 + BUF_THREAD_ID_SIZE];
+
+ /* Non-positive id: describe the shadow (idle) task. */
+ sprintf(tmpstr, "Shadow task %d for pid 0",
+ (int)(-ks->threadid-1));
+ kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
+ }
+ break;
+ }
+}
+
+/*
+ * Handle the 'H' packets, which select the thread used by later
+ * operations: "Hg<id>" sets kgdb_usethread, "Hc<id>" sets
+ * kgdb_contthread (id 0 clears it). A positive id with no matching task
+ * yields an EINVAL error packet; otherwise the reply is "OK". Other
+ * sub-commands are ignored.
+ */
+static void gdb_cmd_task(struct kgdb_state *ks)
+{
+	struct task_struct *thread;
+	char *ptr = &remcom_in_buffer[2];
+
+	switch (remcom_in_buffer[1]) {
+	case 'g':
+		kgdb_hex2long(&ptr, &ks->threadid);
+		thread = getthread(ks->linux_regs, ks->threadid);
+		if (!thread && ks->threadid > 0) {
+			error_packet(remcom_out_buffer, -EINVAL);
+			break;
+		}
+		kgdb_usethread = thread;
+		ks->kgdb_usethreadid = ks->threadid;
+		strcpy(remcom_out_buffer, "OK");
+		break;
+	case 'c':
+		kgdb_hex2long(&ptr, &ks->threadid);
+		thread = NULL;
+		if (ks->threadid) {
+			thread = getthread(ks->linux_regs, ks->threadid);
+			if (!thread && ks->threadid > 0) {
+				error_packet(remcom_out_buffer, -EINVAL);
+				break;
+			}
+		}
+		kgdb_contthread = thread;
+		strcpy(remcom_out_buffer, "OK");
+		break;
+	}
+}
+
+/*
+ * Handle the 'T' thread query packet: reply "OK" if getthread() finds a
+ * task for the given id, otherwise an EINVAL error packet.
+ */
+static void gdb_cmd_thread(struct kgdb_state *ks)
+{
+	char *ptr = &remcom_in_buffer[1];
+
+	kgdb_hex2long(&ptr, &ks->threadid);
+
+	if (!getthread(ks->linux_regs, ks->threadid)) {
+		error_packet(remcom_out_buffer, -EINVAL);
+		return;
+	}
+
+	strcpy(remcom_out_buffer, "OK");
+}
+
+/*
+ * Handle the 'z' or 'Z' breakpoint remove or set packets.
+ *
+ * Expected form: "[zZ]<type>,<addr>,<length>". Type '0' is handled by
+ * the software breakpoint table; other accepted types are forwarded to
+ * the arch hardware breakpoint hooks. Unsupported types return without
+ * writing a reply, malformed packets get an EINVAL error packet, and a
+ * handled request gets "OK" or an error packet with the handler's error.
+ */
+static void gdb_cmd_break(struct kgdb_state *ks)
+{
+ /*
+ * Since GDB-5.3, it's been drafted that '0' is a software
+ * breakpoint, '1' is a hardware breakpoint, so let's do that.
+ */
+ char *bpt_type = &remcom_in_buffer[1];
+ char *ptr = &remcom_in_buffer[2];
+ unsigned long addr;
+ unsigned long length;
+ int error = 0;
+
+ /* With an arch set_hw_breakpoint hook, types '1'..'4' are accepted. */
+ if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
+ /* Unsupported */
+ if (*bpt_type > '4')
+ return;
+ } else {
+ if (*bpt_type != '0' && *bpt_type != '1')
+ /* Unsupported. */
+ return;
+ }
+
+ /*
+ * Test if this is a hardware breakpoint, and
+ * if we support it:
+ */
+ if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
+ /* Unsupported. */
+ return;
+
+ /* Parse ",<addr>,<length>". */
+ if (*(ptr++) != ',') {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (!kgdb_hex2long(&ptr, &addr)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+ if (*(ptr++) != ',' ||
+ !kgdb_hex2long(&ptr, &length)) {
+ error_packet(remcom_out_buffer, -EINVAL);
+ return;
+ }
+
+ /* 'Z' sets, 'z' removes; type '0' is software, the rest hardware. */
+ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
+ error = kgdb_set_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
+ error = kgdb_remove_sw_break(addr);
+ else if (remcom_in_buffer[0] == 'Z')
+ error = arch_kgdb_ops.set_hw_breakpoint(addr,
+ (int)length, *bpt_type - '0');
+ else if (remcom_in_buffer[0] == 'z')
+ error = arch_kgdb_ops.remove_hw_breakpoint(addr,
+ (int) length, *bpt_type - '0');
+
+ if (error == 0)
+ strcpy(remcom_out_buffer, "OK");
+ else
+ error_packet(remcom_out_buffer, error);
+}
+
+/*
+ * Handle the 'C' signal / exception passing packets.
+ *
+ *  C09 == pass exception: rewritten to a 'c' packet, returns -1 so the
+ *         caller falls through to continue handling.
+ *  C15 == detach kgdb, pass exception: rewritten to a 'D' packet, all
+ *         breakpoints removed, debugger disconnected, returns 1.
+ *
+ * Anything else gets an EINVAL error packet and returns 0.
+ */
+static int gdb_cmd_exception_pass(struct kgdb_state *ks)
+{
+	char hi = remcom_in_buffer[1];
+	char lo = remcom_in_buffer[2];
+
+	if (hi == '0' && lo == '9') {
+		ks->pass_exception = 1;
+		remcom_in_buffer[0] = 'c';
+
+		/* Indicate fall through */
+		return -1;
+	}
+
+	if (hi == '1' && lo == '5') {
+		ks->pass_exception = 1;
+		remcom_in_buffer[0] = 'D';
+		remove_all_break();
+		kgdb_connected = 0;
+		return 1;
+	}
+
+	error_packet(remcom_out_buffer, -EINVAL);
+	return 0;
+}
+
+/*
+ * This function performs all gdbserial command processing.
+ *
+ * If a debugger is already connected, a 'T' stop reply naming the
+ * current thread is sent first. The function then loops reading packets
+ * and dispatching on the first character until a packet ends the
+ * session (continue, step, detach, kill, ...).
+ *
+ * Returns 1 if the exception should be passed on (ks->pass_exception),
+ * otherwise 0.
+ */
+static int gdb_serial_stub(struct kgdb_state *ks)
+{
+ int error = 0;
+ int tmp;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ if (kgdb_connected) {
+ unsigned char thref[8];
+ char *ptr;
+
+ /* Reply to host that an exception has occurred */
+ ptr = remcom_out_buffer;
+ *ptr++ = 'T';
+ ptr = pack_hex_byte(ptr, ks->signo);
+ ptr += strlen(strcpy(ptr, "thread:"));
+ int_to_threadref(thref, shadow_pid(current->pid));
+ ptr = pack_threadid(ptr, thref);
+ *ptr++ = ';';
+ put_packet(remcom_out_buffer);
+ }
+
+ /* Default the selected thread to the task that hit the exception. */
+ kgdb_usethread = kgdb_info[ks->cpu].task;
+ ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
+ ks->pass_exception = 0;
+
+ while (1) {
+ error = 0;
+
+ /* Clear the out buffer. */
+ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
+
+ get_packet(remcom_in_buffer);
+
+ switch (remcom_in_buffer[0]) {
+ case '?': /* gdbserial status */
+ gdb_cmd_status(ks);
+ break;
+ case 'g': /* return the value of the CPU registers */
+ gdb_cmd_getregs(ks);
+ break;
+ case 'G': /* set the value of the CPU registers - return OK */
+ gdb_cmd_setregs(ks);
+ break;
+ case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
+ gdb_cmd_memread(ks);
+ break;
+ case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_memwrite(ks);
+ break;
+ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
+ gdb_cmd_binwrite(ks);
+ break;
+ /* kill or detach. KGDB should treat this like a
+ * continue.
+ */
+ case 'D': /* Debugger detach */
+ case 'k': /* Debugger detach via kill */
+ gdb_cmd_detachkill(ks);
+ goto default_handle;
+ case 'R': /* Reboot */
+ if (gdb_cmd_reboot(ks))
+ goto default_handle;
+ break;
+ case 'q': /* query command */
+ gdb_cmd_query(ks);
+ break;
+ case 'H': /* task related */
+ gdb_cmd_task(ks);
+ break;
+ case 'T': /* Query thread status */
+ gdb_cmd_thread(ks);
+ break;
+ case 'z': /* Break point remove */
+ case 'Z': /* Break point set */
+ gdb_cmd_break(ks);
+ break;
+ case 'C': /* Exception passing */
+ /* >0: detach and pass, 0: error reply, <0: treat as 'c'. */
+ tmp = gdb_cmd_exception_pass(ks);
+ if (tmp > 0)
+ goto default_handle;
+ if (tmp == 0)
+ break;
+ /* Fall through on tmp < 0 */
+ case 'c': /* Continue packet */
+ case 's': /* Single step packet */
+ if (kgdb_contthread && kgdb_contthread != current) {
+ /* Can't switch threads in kgdb */
+ error_packet(remcom_out_buffer, -EINVAL);
+ break;
+ }
+ kgdb_activate_sw_breakpoints();
+ /* Fall through to default processing */
+ default:
+default_handle:
+ /* Everything not handled above goes to the arch handler. */
+ error = kgdb_arch_handle_exception(ks->ex_vector,
+ ks->signo,
+ ks->err_code,
+ remcom_in_buffer,
+ remcom_out_buffer,
+ ks->linux_regs);
+ /*
+ * Leave cmd processing on error, detach,
+ * kill, continue, or single step.
+ */
+ if (error >= 0 || remcom_in_buffer[0] == 'D' ||
+ remcom_in_buffer[0] == 'k') {
+ error = 0;
+ goto kgdb_exit;
+ }
+
+ }
+
+ /* reply to the request */
+ put_packet(remcom_out_buffer);
+ }
+
+kgdb_exit:
+ if (ks->pass_exception)
+ error = 1;
+ return error;
+}
+
+/*
+ * Detect an exception taken on the CPU that already owns the debugger
+ * (kgdb_active equals this CPU).
+ *
+ * Returns 0 if this is not a re-entry. On re-entry, if a software
+ * breakpoint at the faulting address could be removed, the exception is
+ * skipped, a warning is printed and 1 is returned; otherwise all
+ * breakpoints are removed and the kernel panics.
+ */
+static int kgdb_reenter_check(struct kgdb_state *ks)
+{
+ unsigned long addr;
+
+ if (atomic_read(&kgdb_active) != raw_smp_processor_id())
+ return 0;
+
+ /* Panic on recursive debugger calls: */
+ exception_level++;
+ addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
+ kgdb_deactivate_sw_breakpoints();
+
+ /*
+ * If the break point removed ok at the place exception
+ * occurred, try to recover and print a warning to the end
+ * user because the user planted a breakpoint in a place that
+ * KGDB needs in order to function.
+ */
+ if (kgdb_remove_sw_break(addr) == 0) {
+ exception_level = 0;
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+ kgdb_activate_sw_breakpoints();
+ printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
+ addr);
+ WARN_ON_ONCE(1);
+
+ return 1;
+ }
+ /* No breakpoint of ours at addr: give up on all of them. */
+ remove_all_break();
+ kgdb_skipexception(ks->ex_vector, ks->linux_regs);
+
+ /* Nested more than once: panic without the extra message below. */
+ if (exception_level > 1) {
+ dump_stack();
+ panic("Recursive entry to debugger");
+ }
+
+ printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
+ dump_stack();
+ panic("Recursive entry to debugger");
+
+ return 1;
+}
+
+/*
+ * kgdb_handle_exception() - main entry point from a kernel exception
+ *
+ * @evector: exception vector, stored in the session state
+ * @signo:   signal number reported to the debugger
+ * @ecode:   error code, stored in the session state
+ * @regs:    registers at the time of the exception
+ *
+ * Takes the kgdb_active lock for this CPU, parks the other CPUs, runs
+ * the gdbserial session and then releases everything again.
+ *
+ * Returns 0 on a handled re-entry or a skipped (removed) breakpoint,
+ * 1 if no I/O driver is available, otherwise the result of
+ * gdb_serial_stub().
+ *
+ * Locking hierarchy:
+ *	interface locks, if any (begin_session)
+ *	kgdb lock (kgdb_active)
+ */
+int
+kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
+{
+	struct kgdb_state kgdb_var;
+	struct kgdb_state *ks = &kgdb_var;
+	unsigned long flags;
+	int error = 0;
+	int i, cpu;
+
+	ks->cpu			= raw_smp_processor_id();
+	ks->ex_vector		= evector;
+	ks->signo		= signo;
+	ks->err_code		= ecode;
+	ks->kgdb_usethreadid	= 0;
+	ks->linux_regs		= regs;
+
+	if (kgdb_reenter_check(ks))
+		return 0; /* Ouch, double exception ! */
+
+acquirelock:
+	/*
+	 * Interrupts will be restored by the 'trap return' code, except when
+	 * single stepping.
+	 */
+	local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+
+	/*
+	 * Acquire the kgdb_active lock:
+	 */
+	while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
+		cpu_relax();
+
+	/*
+	 * Do not start the debugger connection on this CPU if the last
+	 * instance of the exception handler wanted to come into the
+	 * debugger on a different CPU via a single step
+	 */
+	if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
+	    atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
+
+		atomic_set(&kgdb_active, -1);
+		clocksource_touch_watchdog();
+		local_irq_restore(flags);
+
+		goto acquirelock;
+	}
+
+	if (!kgdb_io_ready(1)) {
+		error = 1;
+		goto kgdb_restore; /* No I/O connection, so resume the system */
+	}
+
+	/*
+	 * Don't enter if we have hit a removed breakpoint.
+	 */
+	if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
+		goto kgdb_restore;
+
+	/* Call the I/O driver's pre_exception routine */
+	if (kgdb_io_ops->pre_exception)
+		kgdb_io_ops->pre_exception();
+
+	kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
+	kgdb_info[ks->cpu].task = current;
+
+	kgdb_disable_hw_debug(ks->linux_regs);
+
+	/*
+	 * Get the passive CPU lock which will hold all the non-primary
+	 * CPU in a spin state while the debugger is active
+	 */
+	if (!kgdb_single_step || !kgdb_contthread) {
+		for (i = 0; i < NR_CPUS; i++)
+			atomic_set(&passive_cpu_wait[i], 1);
+	}
+
+	/*
+	 * spin_lock code is good enough as a barrier so we don't
+	 * need one here:
+	 */
+	atomic_set(&cpu_in_kgdb[ks->cpu], 1);
+
+#ifdef CONFIG_SMP
+	/* Signal the other CPUs to enter kgdb_wait() */
+	if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+		kgdb_roundup_cpus(flags);
+#endif
+
+	/*
+	 * Wait for the other CPUs to be notified and be waiting for us:
+	 */
+	for_each_online_cpu(i) {
+		while (!atomic_read(&cpu_in_kgdb[i]))
+			cpu_relax();
+	}
+
+	/*
+	 * At this point the primary processor is completely
+	 * in the debugger and all secondary CPUs are quiescent
+	 */
+	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
+	kgdb_deactivate_sw_breakpoints();
+	kgdb_single_step = 0;
+	kgdb_contthread = NULL;
+	exception_level = 0;
+
+	/* Talk to debugger with gdbserial protocol */
+	error = gdb_serial_stub(ks);
+
+	/* Call the I/O driver's post_exception routine */
+	if (kgdb_io_ops->post_exception)
+		kgdb_io_ops->post_exception();
+
+	kgdb_info[ks->cpu].debuggerinfo = NULL;
+	kgdb_info[ks->cpu].task = NULL;
+	atomic_set(&cpu_in_kgdb[ks->cpu], 0);
+
+	if (!kgdb_single_step || !kgdb_contthread) {
+		for (i = NR_CPUS-1; i >= 0; i--)
+			atomic_set(&passive_cpu_wait[i], 0);
+		/*
+		 * Wait till all the CPUs have quit
+		 * from the debugger.
+		 */
+		for_each_online_cpu(i) {
+			while (atomic_read(&cpu_in_kgdb[i]))
+				cpu_relax();
+		}
+	}
+
+kgdb_restore:
+	/* Free kgdb_active */
+	atomic_set(&kgdb_active, -1);
+	clocksource_touch_watchdog();
+	local_irq_restore(flags);
+
+	return error;
+}
+
+/*
+ * Called on a CPU that is being rounded up for the debugger.
+ *
+ * @cpu:  the CPU this is running on
+ * @regs: that CPU's registers, passed on to kgdb_wait()
+ *
+ * If another CPU owns the debugger and is inside it, and this CPU is not
+ * already parked, park it in kgdb_wait() and return 0. Returns 1 when
+ * nothing was done (always the case without CONFIG_SMP).
+ */
+int kgdb_nmicallback(int cpu, void *regs)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Sample the owner once. kgdb_active is -1 when no CPU owns the
+	 * debugger, and must not be used as an index in that case.
+	 */
+	int active = atomic_read(&kgdb_active);
+
+	if (active >= 0 && active != cpu &&
+	    !atomic_read(&cpu_in_kgdb[cpu]) &&
+	    atomic_read(&cpu_in_kgdb[active])) {
+		kgdb_wait((struct pt_regs *)regs);
+		return 0;
+	}
+#endif
+	return 1;
+}
+
+/*
+ * Console write hook: forward count bytes of s to the debugger as
+ * output packets, with local interrupts disabled. Nothing is sent while
+ * the debugger is active (kgdb_active != -1) or not connected.
+ */
+void kgdb_console_write(struct console *co, const char *s, unsigned count)
+{
+	unsigned long flags;
+
+	if (kgdb_connected && atomic_read(&kgdb_active) == -1) {
+		local_irq_save(flags);
+		kgdb_msg_write(s, count);
+		local_irq_restore(flags);
+	}
+}
+
+/* Console that routes kernel messages through kgdb_console_write(). */
+static struct console kgdbcons = {
+	.name		= "kgdb",
+	.index		= -1,
+	.flags		= CON_ENABLED | CON_PRINTBUFFER,
+	.write		= kgdb_console_write,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/*
+ * SysRq handler: break into the debugger, provided an I/O module is
+ * registered. Announces the entry if no debugger is connected yet.
+ */
+static void sysrq_handle_gdb(int key, struct tty_struct *tty)
+{
+	if (kgdb_io_ops) {
+		if (!kgdb_connected)
+			printk(KERN_CRIT "Entering KGDB\n");
+
+		kgdb_breakpoint();
+		return;
+	}
+
+	printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
+}
+
+/* SysRq operation descriptor for sysrq_handle_gdb(). */
+static struct sysrq_key_op sysrq_gdb_op = {
+	.help_msg	= "Gdb",
+	.action_msg	= "GDB",
+	.handler	= sysrq_handle_gdb,
+};
+#endif
+
+/*
+ * Arm KGDB once: run the arch init, register the SysRq 'g' key (when
+ * configured) and, if requested via kgdb_use_con, the kgdb console.
+ * Does nothing if the callbacks are already registered.
+ */
+static void kgdb_register_callbacks(void)
+{
+	if (kgdb_io_module_registered)
+		return;
+
+	kgdb_io_module_registered = 1;
+	kgdb_arch_init();
+#ifdef CONFIG_MAGIC_SYSRQ
+	register_sysrq_key('g', &sysrq_gdb_op);
+#endif
+	if (kgdb_use_con && !kgdb_con_registered) {
+		register_console(&kgdbcons);
+		kgdb_con_registered = 1;
+	}
+}
+
+/*
+ * Undo kgdb_register_callbacks(): run the arch exit, unregister the
+ * SysRq 'g' key (when configured) and the kgdb console if it was
+ * registered. Does nothing if the callbacks are not registered.
+ *
+ * When this routine is called KGDB should unregister from the panic
+ * handler and clean up, making sure it is not handling any break
+ * exceptions at the time.
+ */
+static void kgdb_unregister_callbacks(void)
+{
+	if (!kgdb_io_module_registered)
+		return;
+
+	kgdb_io_module_registered = 0;
+	kgdb_arch_exit();
+#ifdef CONFIG_MAGIC_SYSRQ
+	unregister_sysrq_key('g', &sysrq_gdb_op);
+#endif
+	if (kgdb_con_registered) {
+		unregister_console(&kgdbcons);
+		kgdb_con_registered = 0;
+	}
+}
+
+/*
+ * Take the one-shot startup breakpoint: clear the kgdb_break_asap
+ * request, announce that we are waiting, and trap into the debugger.
+ */
+static void kgdb_initial_breakpoint(void)
+{
+	/* Consume the request so the breakpoint is not taken twice. */
+	kgdb_break_asap = 0;
+
+	printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
+
+	kgdb_breakpoint();
+}
+
+/**
+ *	kgdb_register_io_module - register KGDB IO module
+ *	@new_kgdb_io_ops: the io ops vector
+ *
+ *	Register it with the KGDB core. Only one I/O driver can be
+ *	registered at a time. Returns 0 on success, -EBUSY if a driver is
+ *	already registered, or the error from the driver's init() hook.
+ */
+int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
+{
+	int err = 0;
+
+	spin_lock(&kgdb_registration_lock);
+
+	if (kgdb_io_ops) {
+		spin_unlock(&kgdb_registration_lock);
+
+		printk(KERN_ERR "kgdb: Another I/O driver is already "
+				"registered with KGDB.\n");
+		return -EBUSY;
+	}
+
+	/* The driver is only installed if its init hook (if any) succeeds. */
+	if (new_kgdb_io_ops->init)
+		err = new_kgdb_io_ops->init();
+	if (!err)
+		kgdb_io_ops = new_kgdb_io_ops;
+
+	spin_unlock(&kgdb_registration_lock);
+
+	if (err)
+		return err;
+
+	printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
+	       new_kgdb_io_ops->name);
+
+	/* Arm KGDB now. */
+	kgdb_register_callbacks();
+
+	if (kgdb_break_asap)
+		kgdb_initial_breakpoint();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kgdb_register_io_module);
+
+/**
+ *	kgdb_unregister_io_module - unregister KGDB IO module
+ *	@old_kgdb_io_ops: the io ops vector
+ *
+ *	Unregister it with the KGDB core. Must not be called while a
+ *	debugger is connected.
+ */
+void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
+{
+	BUG_ON(kgdb_connected);
+
+	/*
+	 * KGDB is no longer able to communicate out, so
+	 * unregister our callbacks and reset state.
+	 */
+	kgdb_unregister_callbacks();
+
+	spin_lock(&kgdb_registration_lock);
+	/* Warn (once) if the caller is not the registered driver. */
+	WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
+	kgdb_io_ops = NULL;
+	spin_unlock(&kgdb_registration_lock);
+
+	printk(KERN_INFO
+		"kgdb: Unregistered I/O driver %s, debugger disabled.\n",
+		old_kgdb_io_ops->name);
+}
+EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
+
+/**
+ * kgdb_breakpoint - generate breakpoint exception
+ *
+ * This function will generate a breakpoint exception. It is used at the
+ * beginning of a program to sync up with a debugger and can be used
+ * otherwise as a quick means to stop program execution and "break" into
+ * the debugger.
+ */
+void kgdb_breakpoint(void)
+{
+ /*
+ * kgdb_setting_breakpoint is raised for the duration of the trap;
+ * kgdb_io_ready() reads it to tell a deliberate breakpoint apart.
+ */
+ atomic_set(&kgdb_setting_breakpoint, 1);
+ wmb(); /* Sync point before breakpoint */
+ arch_kgdb_breakpoint();
+ wmb(); /* Sync point after breakpoint */
+ atomic_set(&kgdb_setting_breakpoint, 0);
+}
+EXPORT_SYMBOL_GPL(kgdb_breakpoint);
+
+/*
+ * "kgdbwait" early parameter: request a breakpoint as soon as possible.
+ * If the callbacks are already registered the breakpoint is taken
+ * immediately; otherwise kgdb_break_asap stays set for later.
+ * Always returns 0.
+ */
+static int __init opt_kgdb_wait(char *str)
+{
+	kgdb_break_asap = 1;
+
+	if (kgdb_io_module_registered) {
+		kgdb_initial_breakpoint();
+	}
+
+	return 0;
+}
+
+early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bb7df2a28bd7..22be3ff3f363 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data)
*/
set_user_nice(current, 0);
- retval = -EPERM;
- if (current->fs->root)
- retval = kernel_execve(sub_info->path,
- sub_info->argv, sub_info->envp);
+ retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
/* Exec failed? */
sub_info->retval = retval;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7a86e6432338..fcfb580c3afc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
return 0;
}
+/*
+ * If we have a symbol_name argument, look it up and add the offset field
+ * to it. This way, we can specify a relative address to a symbol.
+ *
+ * Returns the probe address with p->offset applied, or NULL if both
+ * symbol_name and addr are given or if no address is available.
+ * p itself is not modified here.
+ */
+static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+{
+ kprobe_opcode_t *addr = p->addr;
+ if (p->symbol_name) {
+ /* A symbol name and an explicit address are mutually exclusive. */
+ if (addr)
+ return NULL;
+ /* presumably stores the symbol's address in addr - confirm */
+ kprobe_lookup_name(p->symbol_name, addr);
+ }
+
+ if (!addr)
+ return NULL;
+ return (kprobe_opcode_t *)(((char *)addr) + p->offset);
+}
+
static int __kprobes __register_kprobe(struct kprobe *p,
unsigned long called_from)
{
int ret = 0;
struct kprobe *old_p;
struct module *probed_mod;
+ kprobe_opcode_t *addr;
- /*
- * If we have a symbol_name argument look it up,
- * and add it to the address. That way the addr
- * field can either be global or relative to a symbol.
- */
- if (p->symbol_name) {
- if (p->addr)
- return -EINVAL;
- kprobe_lookup_name(p->symbol_name, p->addr);
- }
-
- if (!p->addr)
+ addr = kprobe_addr(p);
+ if (!addr)
return -EINVAL;
- p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
+ p->addr = addr;
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr))
@@ -678,8 +687,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
unregister_kprobe(&jp->kp);
}
-#ifdef ARCH_SUPPORTS_KRETPROBES
-
+#ifdef CONFIG_KRETPROBES
/*
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
@@ -722,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
int ret = 0;
struct kretprobe_instance *inst;
int i;
- void *addr = rp->kp.addr;
+ void *addr;
if (kretprobe_blacklist_size) {
- if (addr == NULL)
- kprobe_lookup_name(rp->kp.symbol_name, addr);
- addr += rp->kp.offset;
+ addr = kprobe_addr(&rp->kp);
+ if (!addr)
+ return -EINVAL;
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
if (kretprobe_blacklist[i].addr == addr)
@@ -769,8 +777,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
return ret;
}
-#else /* ARCH_SUPPORTS_KRETPROBES */
-
+#else /* CONFIG_KRETPROBES */
int __kprobes register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
@@ -781,8 +788,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
{
return 0;
}
-
-#endif /* ARCH_SUPPORTS_KRETPROBES */
+#endif /* CONFIG_KRETPROBES */
void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3574379f4d62..81a4e4a3f087 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* parallel walking of the hash-list safe:
*/
list_add_tail_rcu(&class->hash_entry, hash_head);
+ /*
+ * Add it to the global list of classes:
+ */
+ list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
if (verbose(class)) {
graph_unlock();
@@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
break;
case LOCK_USED:
- /*
- * Add it to the global list of classes:
- */
- list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
debug_atomic_dec(&nr_unused_locks);
break;
default:
diff --git a/kernel/marker.c b/kernel/marker.c
index 5323cfaedbce..005b95954593 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -27,35 +27,42 @@
extern struct marker __start___markers[];
extern struct marker __stop___markers[];
+/* Set to 1 to enable marker debug output */
+const int marker_debug;
+
/*
* markers_mutex nests inside module_mutex. Markers mutex protects the builtin
- * and module markers, the hash table and deferred_sync.
+ * and module markers and the hash table.
*/
static DEFINE_MUTEX(markers_mutex);
/*
- * Marker deferred synchronization.
- * Upon marker probe_unregister, we delay call to synchronize_sched() to
- * accelerate mass unregistration (only when there is no more reference to a
- * given module do we call synchronize_sched()). However, we need to make sure
- * every critical region has ended before we re-arm a marker that has been
- * unregistered and then registered back with a different probe data.
- */
-static int deferred_sync;
-
-/*
* Marker hash table, containing the active markers.
* Protected by module_mutex.
*/
#define MARKER_HASH_BITS 6
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operations (add or remove) on a given marker. It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ * marker entries modifications are protected by the markers_mutex.
+ */
struct marker_entry {
struct hlist_node hlist;
char *format;
- marker_probe_func *probe;
- void *private;
+ void (*call)(const struct marker *mdata, /* Probe wrapper */
+ void *call_private, const char *fmt, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_closure *multi;
int refcount; /* Number of times armed. 0 if disarmed. */
+ struct rcu_head rcu;
+ void *oldptr;
+ unsigned char rcu_pending:1;
+ unsigned char ptype:1;
char name[0]; /* Contains name'\0'format'\0' */
};
@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
/**
* __mark_empty_function - Empty probe callback
- * @mdata: pointer of type const struct marker
+ * @probe_private: probe private data
+ * @call_private: call site private data
* @fmt: format string
* @...: variable argument list
*
@@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
* though the function pointer change and the marker enabling are two distinct
* operations that modifies the execution flow of preemptible code.
*/
-void __mark_empty_function(const struct marker *mdata, void *private,
- const char *fmt, ...)
+void __mark_empty_function(void *probe_private, void *call_private,
+ const char *fmt, va_list *args)
{
}
EXPORT_SYMBOL_GPL(__mark_empty_function);
/*
+ * marker_probe_cb Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @fmt: format string
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+void marker_probe_cb(const struct marker *mdata, void *call_private,
+	const char *fmt, ...)
+{
+	va_list args;
+	char ptype;
+
+	/*
+	 * preempt_disable does two things : disabling preemption to make sure
+	 * the teardown of the callbacks can be done correctly when they are in
+	 * modules, and ensuring RCU read coherency.
+	 */
+	preempt_disable();
+	ptype = mdata->ptype;
+	if (likely(!ptype)) {
+		marker_probe_func *func;
+		/* Must read the ptype before ptr. They are not data dependent,
+		 * so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func = mdata->single.func;
+		/* Must read the ptr before private data. They are not data
+		 * dependent, so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		va_start(args, fmt);
+		func(mdata->single.probe_private, call_private, fmt, &args);
+		va_end(args);
+	} else {
+		struct marker_probe_closure *multi;
+		int i;
+		/*
+		 * Read mdata->ptype before mdata->multi. The two loads are
+		 * not data dependent, so an explicit smp_rmb() is needed to
+		 * pair with the smp_wmb() issued between the multi and ptype
+		 * stores on the update side.
+		 */
+		smp_rmb();
+		multi = mdata->multi;
+		/*
+		 * multi points to an array, therefore accessing the array
+		 * depends on reading multi. The dependency barrier must come
+		 * _after_ the pointer load and _before_ the first access to
+		 * the array data, the same ordering rcu_dereference()
+		 * provides.
+		 */
+		smp_read_barrier_depends();
+		for (i = 0; multi[i].func; i++) {
+			/* each probe gets a freshly started argument list */
+			va_start(args, fmt);
+			multi[i].func(multi[i].probe_private, call_private, fmt,
+				&args);
+			va_end(args);
+		}
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb_noarg Callback that does not prepare the variable argument
+ * list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @fmt: format string
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ * The probes receive the address of a va_list that is never started, so they
+ * must not read arguments from it.
+ */
+void marker_probe_cb_noarg(const struct marker *mdata,
+	void *call_private, const char *fmt, ...)
+{
+	va_list args;	/* not initialized */
+	char ptype;
+
+	preempt_disable();
+	ptype = mdata->ptype;
+	if (likely(!ptype)) {
+		marker_probe_func *func;
+		/* Must read the ptype before ptr. They are not data dependent,
+		 * so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func = mdata->single.func;
+		/* Must read the ptr before private data. They are not data
+		 * dependent, so we put an explicit smp_rmb() here. */
+		smp_rmb();
+		func(mdata->single.probe_private, call_private, fmt, &args);
+	} else {
+		struct marker_probe_closure *multi;
+		int i;
+		/*
+		 * Read mdata->ptype before mdata->multi. The two loads are
+		 * not data dependent, so an explicit smp_rmb() is needed to
+		 * pair with the smp_wmb() issued between the multi and ptype
+		 * stores on the update side.
+		 */
+		smp_rmb();
+		multi = mdata->multi;
+		/*
+		 * multi points to an array, therefore accessing the array
+		 * depends on reading multi. The dependency barrier must come
+		 * _after_ the pointer load and _before_ the first access to
+		 * the array data, the same ordering rcu_dereference()
+		 * provides.
+		 */
+		smp_read_barrier_depends();
+		for (i = 0; multi[i].func; i++)
+			multi[i].func(multi[i].probe_private, call_private, fmt,
+				&args);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
+
+/*
+ * Callback handed to call_rcu() by the probe register/unregister paths.
+ * Releases the closure array that was parked in entry->oldptr, then clears
+ * entry->rcu_pending so that later operations on the entry know no callback
+ * is outstanding anymore.
+ * @head: the rcu_head embedded in the owning struct marker_entry
+ */
+static void free_old_closure(struct rcu_head *head)
+{
+ /* recover the marker_entry that embeds this rcu_head */
+ struct marker_entry *entry = container_of(head,
+ struct marker_entry, rcu);
+ kfree(entry->oldptr);
+ /* Make sure we free the data before setting the pending flag to 0 */
+ smp_wmb();
+ entry->rcu_pending = 0;
+}
+
+/*
+ * Log, at KERN_DEBUG level, every probe closure currently attached to a
+ * marker entry: the single closure when the entry is in single-probe mode,
+ * or each element of the NULL-func terminated array otherwise.
+ * Silent unless marker_debug is non-zero.
+ */
+static void debug_print_probes(struct marker_entry *entry)
+{
+	struct marker_probe_closure *closure;
+	int idx;
+
+	if (!marker_debug)
+		return;
+
+	if (entry->ptype) {
+		/* multi-probe mode: walk the array up to its terminator */
+		closure = entry->multi;
+		idx = 0;
+		while (closure[idx].func) {
+			printk(KERN_DEBUG "Multi probe %d : %p %p\n", idx,
+				closure[idx].func,
+				closure[idx].probe_private);
+			idx++;
+		}
+		return;
+	}
+
+	/* single-probe mode */
+	printk(KERN_DEBUG "Single probe : %p %p\n",
+		entry->single.func,
+		entry->single.probe_private);
+}
+
+/*
+ * Attach the closure (probe, probe_private) to a marker entry.
+ * @entry: marker entry to modify
+ * @probe: probe function to add (a NULL probe only triggers a WARN_ON)
+ * @probe_private: private data passed to the probe
+ *
+ * Returns:
+ *  - ERR_PTR(-EBUSY) if the same (probe, probe_private) pair is already
+ *    attached,
+ *  - ERR_PTR(-ENOMEM) if the new closure array cannot be allocated,
+ *  - NULL when there is no previous array to release (0 -> 1 and 1 -> 2
+ *    transitions),
+ *  - otherwise the previous multi-probe array, which the caller is expected
+ *    to free once readers are done with it.
+ */
+static struct marker_probe_closure *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0;
+ struct marker_probe_closure *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ /* single-probe mode: refuse an exact duplicate */
+ if (entry->single.func == probe &&
+ entry->single.probe_private == probe_private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.probe_private = probe_private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ /* no array to hand back; new[0] is filled from entry->single below */
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ /* count the existing closures while checking for a duplicate */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ if (old[nr_probes].func == probe
+ && old[nr_probes].probe_private
+ == probe_private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new[0] = entry->single;
+ else
+ memcpy(new, old,
+ nr_probes * sizeof(struct marker_probe_closure));
+ /* append the new closure; the zeroed last slot terminates the array */
+ new[nr_probes].func = probe;
+ new[nr_probes].probe_private = probe_private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
+ * Detach closures from a marker entry.
+ * @entry: marker entry to modify
+ * @probe: probe function to match, or NULL to match on private data only
+ * @probe_private: private data to match
+ *
+ * In multi-probe mode a closure is dropped when its probe_private equals
+ * @probe_private and either @probe is NULL or its func equals @probe.
+ * In single-probe mode the entry is always reset to __mark_empty_function;
+ * a mismatch with the arguments only triggers a WARN_ON.
+ *
+ * Returns:
+ *  - NULL when the entry was in single-probe mode (nothing to release),
+ *  - ERR_PTR(-ENOMEM) if the reduced array cannot be allocated,
+ *  - otherwise the previous multi-probe array, which the caller is expected
+ *    to free once readers are done with it.
+ */
+static struct marker_probe_closure *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_closure *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.probe_private != probe_private);
+ /* single.probe_private is left as is; only func is reset */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ /* count all closures and how many of them match the request */
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if ((!probe || old[nr_probes].func == probe)
+ && old[nr_probes].probe_private
+ == probe_private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ /* copy the one surviving closure back into single-probe storage */
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].probe_private != probe_private)
+ entry->single = old[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure), GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ /* keep every closure that does not match the request */
+ for (i = 0; old[i].func; i++)
+ if ((probe && old[i].func != probe) ||
+ old[i].probe_private != probe_private)
+ new[j++] = old[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
* Get marker if the marker is present in the marker hash table.
* Must be called with markers_mutex held.
* Returns NULL if not present.
@@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name)
* Add the marker to the marker hash table. Must be called with markers_mutex
* held.
*/
-static int add_marker(const char *name, const char *format,
- marker_probe_func *probe, void *private)
+static struct marker_entry *add_marker(const char *name, const char *format)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format,
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
printk(KERN_NOTICE
- "Marker %s busy, probe %p already installed\n",
- name, e->probe);
- return -EBUSY; /* Already there */
+ "Marker %s busy\n", name);
+ return ERR_PTR(-EBUSY); /* Already there */
}
}
/*
@@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format,
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memcpy(&e->name[0], name, name_len);
if (format) {
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
trace_mark(core_marker_format, "name %s format %s",
e->name, e->format);
- } else
+ } else {
e->format = NULL;
- e->probe = probe;
- e->private = private;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.probe_private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
e->refcount = 0;
+ e->rcu_pending = 0;
hlist_add_head(&e->hlist, head);
- return 0;
+ return e;
}
/*
* Remove the marker from the marker hash table. Must be called with mutex_lock
* held.
*/
-static void *remove_marker(const char *name)
+static int remove_marker(const char *name)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
int found = 0;
size_t len = strlen(name) + 1;
- void *private = NULL;
u32 hash = jhash(name, len-1, 0);
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
@@ -167,12 +435,16 @@ static void *remove_marker(const char *name)
break;
}
}
- if (found) {
- private = e->private;
- hlist_del(&e->hlist);
- kfree(e);
- }
- return private;
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+ hlist_del(&e->hlist);
+ /* Make sure the call_rcu has been executed */
+ if (e->rcu_pending)
+ rcu_barrier();
+ kfree(e);
+ return 0;
}
/*
@@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
size_t name_len = strlen((*entry)->name) + 1;
size_t format_len = strlen(format) + 1;
+
e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
GFP_KERNEL);
if (!e)
@@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
memcpy(&e->name[0], (*entry)->name, name_len);
e->format = &e->name[name_len];
memcpy(e->format, format, format_len);
- e->probe = (*entry)->probe;
- e->private = (*entry)->private;
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ e->single = (*entry)->single;
+ e->multi = (*entry)->multi;
+ e->ptype = (*entry)->ptype;
e->refcount = (*entry)->refcount;
+ e->rcu_pending = 0;
hlist_add_before(&e->hlist, &(*entry)->hlist);
hlist_del(&(*entry)->hlist);
+ /* Make sure the call_rcu has been executed */
+ if ((*entry)->rcu_pending)
+ rcu_barrier();
kfree(*entry);
*entry = e;
trace_mark(core_marker_format, "name %s format %s",
@@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
/*
* Sets the probe callback corresponding to one marker.
*/
-static int set_marker(struct marker_entry **entry, struct marker *elem)
+static int set_marker(struct marker_entry **entry, struct marker *elem,
+ int active)
{
int ret;
WARN_ON(strcmp((*entry)->name, elem->name) != 0);
@@ -226,26 +509,64 @@ static int set_marker(struct marker_entry **entry, struct marker *elem)
if (ret)
return ret;
}
- elem->call = (*entry)->probe;
- elem->private = (*entry)->private;
- elem->state = 1;
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = (*entry)->call;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.probe_private
+ != (*entry)->single.probe_private &&
+ !elem->ptype);
+ elem->single.probe_private = (*entry)->single.probe_private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = (*entry)->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, (*entry)->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = (*entry)->ptype;
+ elem->state = active;
+
return 0;
}
/*
* Disable a marker and its probe callback.
- * Note: only after a synchronize_sched() issued after setting elem->call to the
- * empty function insures that the original callback is not used anymore. This
- * insured by preemption disabling around the call site.
+ * Note: only waiting an RCU period after setting elem->single.func to the
+ * empty function ensures that the original callback is not used anymore. This
+ * is ensured by preempt_disable around the call site.
*/
static void disable_marker(struct marker *elem)
{
+ /* leave "call" as is. It is known statically. */
elem->state = 0;
- elem->call = __mark_empty_function;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
/*
* Leave the private data and id there, because removal is racy and
- * should be done only after a synchronize_sched(). These are never used
- * until the next initialization anyway.
+ * should be done only after an RCU period. These are never used until
+ * the next initialization anyway.
*/
}
@@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem)
* marker_update_probe_range - Update a probe range
* @begin: beginning of the range
* @end: end of the range
- * @probe_module: module address of the probe being updated
- * @refcount: number of references left to the given probe_module (out)
*
* Updates the probe callback corresponding to a range of markers.
*/
void marker_update_probe_range(struct marker *begin,
- struct marker *end, struct module *probe_module,
- int *refcount)
+ struct marker *end)
{
struct marker *iter;
struct marker_entry *mark_entry;
@@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin,
mutex_lock(&markers_mutex);
for (iter = begin; iter < end; iter++) {
mark_entry = get_marker(iter->name);
- if (mark_entry && mark_entry->refcount) {
- set_marker(&mark_entry, iter);
+ if (mark_entry) {
+ set_marker(&mark_entry, iter,
+ !!mark_entry->refcount);
/*
* ignore error, continue
*/
- if (probe_module)
- if (probe_module ==
- __module_text_address((unsigned long)mark_entry->probe))
- (*refcount)++;
} else {
disable_marker(iter);
}
@@ -286,23 +601,27 @@ void marker_update_probe_range(struct marker *begin,
/*
* Update probes, removing the faulty probes.
- * Issues a synchronize_sched() when no reference to the module passed
- * as parameter is found in the probes so the probe module can be
- * safely unloaded from now on.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Side effect : marker_set_format may delete the marker entry (creating a
+ * replacement).
*/
-static void marker_update_probes(struct module *probe_module)
+static void marker_update_probes(void)
{
- int refcount = 0;
-
/* Core kernel markers */
- marker_update_probe_range(__start___markers,
- __stop___markers, probe_module, &refcount);
+ marker_update_probe_range(__start___markers, __stop___markers);
/* Markers in modules. */
- module_update_markers(probe_module, &refcount);
- if (probe_module && refcount == 0) {
- synchronize_sched();
- deferred_sync = 0;
- }
+ module_update_markers();
}
/**
@@ -310,33 +629,52 @@ static void marker_update_probes(struct module *probe_module)
* @name: marker name
* @format: format string
* @probe: probe handler
- * @private: probe private data
+ * @probe_private: probe private data
*
* private data must be a valid allocated memory address, or NULL.
* Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
*/
int marker_probe_register(const char *name, const char *format,
- marker_probe_func *probe, void *private)
+ marker_probe_func *probe, void *probe_private)
{
struct marker_entry *entry;
int ret = 0;
+ struct marker_probe_closure *old;
mutex_lock(&markers_mutex);
entry = get_marker(name);
- if (entry && entry->refcount) {
- ret = -EBUSY;
- goto end;
- }
- if (deferred_sync) {
- synchronize_sched();
- deferred_sync = 0;
+ if (!entry) {
+ entry = add_marker(name, format);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ goto end;
+ }
}
- ret = add_marker(name, format, probe, private);
- if (ret)
+ /*
+ * If we detect that a call_rcu is pending for this marker,
+ * make sure it's executed now.
+ */
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_add_probe(entry, probe, probe_private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
goto end;
+ }
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
- return ret;
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ WARN_ON(!entry);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+#ifdef CONFIG_PREEMPT_RCU
+ synchronize_sched(); /* Until we have the call_rcu_sched() */
+#endif
+ call_rcu(&entry->rcu, free_old_closure);
end:
mutex_unlock(&markers_mutex);
return ret;
@@ -346,171 +684,173 @@ EXPORT_SYMBOL_GPL(marker_probe_register);
/**
* marker_probe_unregister - Disconnect a probe from a marker
* @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
*
* Returns the private data given to marker_probe_register, or an ERR_PTR().
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt disabled section
+ * has finished.
*/
-void *marker_probe_unregister(const char *name)
+int marker_probe_unregister(const char *name,
+ marker_probe_func *probe, void *probe_private)
{
- struct module *probe_module;
struct marker_entry *entry;
- void *private;
+ struct marker_probe_closure *old;
+ int ret = -ENOENT;
mutex_lock(&markers_mutex);
entry = get_marker(name);
- if (!entry) {
- private = ERR_PTR(-ENOENT);
+ if (!entry)
goto end;
- }
- entry->refcount = 0;
- /* In what module is the probe handler ? */
- probe_module = __module_text_address((unsigned long)entry->probe);
- private = remove_marker(name);
- deferred_sync = 1;
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_remove_probe(entry, probe, probe_private);
mutex_unlock(&markers_mutex);
- marker_update_probes(probe_module);
- return private;
+ marker_update_probes(); /* may update entry */
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry)
+ goto end;
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+#ifdef CONFIG_PREEMPT_RCU
+ synchronize_sched(); /* Until we have the call_rcu_sched() */
+#endif
+ call_rcu(&entry->rcu, free_old_closure);
+ remove_marker(name); /* Ignore busy error message */
+ ret = 0;
end:
mutex_unlock(&markers_mutex);
- return private;
+ return ret;
}
EXPORT_SYMBOL_GPL(marker_probe_unregister);
-/**
- * marker_probe_unregister_private_data - Disconnect a probe from a marker
- * @private: probe private data
- *
- * Unregister a marker by providing the registered private data.
- * Returns the private data given to marker_probe_register, or an ERR_PTR().
- */
-void *marker_probe_unregister_private_data(void *private)
+/*
+ * Scan the whole marker hash table, bucket by bucket, and return the first
+ * entry that has the closure (probe, probe_private) attached, either as its
+ * single probe or as an element of its multi-probe array.
+ * Returns NULL when no entry matches.
+ * Does not take markers_mutex itself.
+ */
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
{
- struct module *probe_module;
- struct hlist_head *head;
- struct hlist_node *node;
struct marker_entry *entry;
- int found = 0;
unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;
- mutex_lock(&markers_mutex);
for (i = 0; i < MARKER_TABLE_SIZE; i++) {
head = &marker_table[i];
hlist_for_each_entry(entry, node, head, hlist) {
- if (entry->private == private) {
- found = 1;
- goto iter_end;
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.probe_private
+ == probe_private)
+ return entry;
+ } else {
+ struct marker_probe_closure *closure;
+ /*
+ * Separate index: i is the bucket index of the
+ * outer loop and must not be clobbered here.
+ */
+ unsigned int j;
+ closure = entry->multi;
+ for (j = 0; closure[j].func; j++) {
+ if (closure[j].func == probe &&
+ closure[j].probe_private
+ == probe_private)
+ return entry;
+ }
}
}
}
-iter_end:
- if (!found) {
- private = ERR_PTR(-ENOENT);
- goto end;
- }
- entry->refcount = 0;
- /* In what module is the probe handler ? */
- probe_module = __module_text_address((unsigned long)entry->probe);
- private = remove_marker(entry->name);
- deferred_sync = 1;
- mutex_unlock(&markers_mutex);
- marker_update_probes(probe_module);
- return private;
-end:
- mutex_unlock(&markers_mutex);
- return private;
+ return NULL;
}
-EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
/**
- * marker_arm - Arm a marker
- * @name: marker name
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
*
- * Activate a marker. It keeps a reference count of the number of
- * arming/disarming done.
- * Returns 0 if ok, error value on error.
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt disabled section
+ * has finished.
*/
-int marker_arm(const char *name)
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *probe_private)
{
struct marker_entry *entry;
int ret = 0;
+ struct marker_probe_closure *old;
mutex_lock(&markers_mutex);
- entry = get_marker(name);
+ entry = get_marker_from_private_data(probe, probe_private);
if (!entry) {
ret = -ENOENT;
goto end;
}
- /*
- * Only need to update probes when refcount passes from 0 to 1.
- */
- if (entry->refcount++)
- goto end;
-end:
+ if (entry->rcu_pending)
+ rcu_barrier();
+ old = marker_entry_remove_probe(entry, NULL, probe_private);
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
- return ret;
-}
-EXPORT_SYMBOL_GPL(marker_arm);
-
-/**
- * marker_disarm - Disarm a marker
- * @name: marker name
- *
- * Disarm a marker. It keeps a reference count of the number of arming/disarming
- * done.
- * Returns 0 if ok, error value on error.
- */
-int marker_disarm(const char *name)
-{
- struct marker_entry *entry;
- int ret = 0;
-
+ marker_update_probes(); /* may update entry */
mutex_lock(&markers_mutex);
- entry = get_marker(name);
- if (!entry) {
- ret = -ENOENT;
- goto end;
- }
- /*
- * Only permit decrement refcount if higher than 0.
- * Do probe update only on 1 -> 0 transition.
- */
- if (entry->refcount) {
- if (--entry->refcount)
- goto end;
- } else {
- ret = -EPERM;
- goto end;
- }
+ entry = get_marker_from_private_data(probe, probe_private);
+ WARN_ON(!entry);
+ entry->oldptr = old;
+ entry->rcu_pending = 1;
+ /* write rcu_pending before calling the RCU callback */
+ smp_wmb();
+#ifdef CONFIG_PREEMPT_RCU
+ synchronize_sched(); /* Until we have the call_rcu_sched() */
+#endif
+ call_rcu(&entry->rcu, free_old_closure);
+ remove_marker(entry->name); /* Ignore busy error message */
end:
mutex_unlock(&markers_mutex);
- marker_update_probes(NULL);
return ret;
}
-EXPORT_SYMBOL_GPL(marker_disarm);
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
/**
* marker_get_private_data - Get a marker's probe private data
* @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
*
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR.
* Returns the private data pointer, or an ERR_PTR.
* The private data pointer should _only_ be dereferenced if the caller is the
* owner of the data, or its content could vanish. This is mostly used to
* confirm that a caller is the owner of a registered probe.
*/
-void *marker_get_private_data(const char *name)
+void *marker_get_private_data(const char *name, marker_probe_func *probe,
+ int num)
{
struct hlist_head *head;
struct hlist_node *node;
struct marker_entry *e;
size_t name_len = strlen(name) + 1;
u32 hash = jhash(name, name_len-1, 0);
- int found = 0;
+ int i;
head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
hlist_for_each_entry(e, node, head, hlist) {
if (!strcmp(name, e->name)) {
- found = 1;
- return e->private;
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.probe_private;
+ else
+ break;
+ } else {
+ struct marker_probe_closure *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure[i].func; i++) {
+ if (closure[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure[i].probe_private;
+ }
+ }
}
}
return ERR_PTR(-ENOENT);
diff --git a/kernel/module.c b/kernel/module.c
index 4202da97a1da..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -987,12 +987,11 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
return ret;
}
-
/*
* /sys/module/foo/sections stuff
* J. Corbet <corbet@lwn.net>
*/
-#ifdef CONFIG_KALLSYMS
+#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
static ssize_t module_sect_show(struct module_attribute *mattr,
struct module *mod, char *buf)
{
@@ -1188,7 +1187,7 @@ static inline void add_notes_attrs(struct module *mod, unsigned int nsect,
static inline void remove_notes_attrs(struct module *mod)
{
}
-#endif /* CONFIG_KALLSYMS */
+#endif
#ifdef CONFIG_SYSFS
int module_add_modinfo_attrs(struct module *mod)
@@ -1231,9 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod)
}
kfree(mod->modinfo_attrs);
}
-#endif
-#ifdef CONFIG_SYSFS
int mod_sysfs_init(struct module *mod)
{
int err;
@@ -1936,8 +1933,15 @@ static struct module *load_module(void __user *umod,
/* Set up license info based on the info section */
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+ /*
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
+ */
if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ add_taint(TAINT_PROPRIETARY_MODULE);
+
+ /* driverloader was caught wrongly pretending to be under GPL */
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -2038,7 +2042,7 @@ static struct module *load_module(void __user *umod,
#ifdef CONFIG_MARKERS
if (!mod->taints)
marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers, NULL, NULL);
+ mod->markers + mod->num_markers);
#endif
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
@@ -2174,10 +2178,20 @@ sys_init_module(void __user *umod,
wake_up(&module_wq);
return ret;
}
+ if (ret > 0) {
+ printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+ "it should follow 0/-E convention\n"
+ KERN_WARNING "%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
+ dump_stack();
+ }
- /* Now it's a first class citizen! */
- mutex_lock(&module_mutex);
+ /* Now it's a first class citizen! Wake up anyone waiting for it. */
mod->state = MODULE_STATE_LIVE;
+ wake_up(&module_wq);
+
+ mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
unwind_remove_table(mod->unwind_info, 1);
@@ -2186,7 +2200,6 @@ sys_init_module(void __user *umod,
mod->init_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
- wake_up(&module_wq);
return 0;
}
@@ -2564,7 +2577,7 @@ EXPORT_SYMBOL(struct_module);
#endif
#ifdef CONFIG_MARKERS
-void module_update_markers(struct module *probe_module, int *refcount)
+void module_update_markers(void)
{
struct module *mod;
@@ -2572,8 +2585,7 @@ void module_update_markers(struct module *probe_module, int *refcount)
list_for_each_entry(mod, &modules, list)
if (!mod->taints)
marker_update_probe_range(mod->markers,
- mod->markers + mod->num_markers,
- probe_module, refcount);
+ mod->markers + mod->num_markers);
mutex_unlock(&module_mutex);
}
#endif
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 022c9c3cee6f..a9b04203a66d 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -767,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags,
/* SIGEV_NONE timers are not queued ! See common_timer_get */
if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
/* Setup correct expiry time for relative timers */
- if (mode == HRTIMER_MODE_REL)
- timer->expires = ktime_add(timer->expires,
- timer->base->get_time());
+ if (mode == HRTIMER_MODE_REL) {
+ timer->expires =
+ ktime_add_safe(timer->expires,
+ timer->base->get_time());
+ }
return 0;
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/pm.txt> and the
+ and more information, read <file:Documentation/power/pm.txt> and the
Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 859a8e59773a..14a656cdc652 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -391,7 +391,7 @@ int hibernation_platform_enter(void)
goto Close;
suspend_console();
- error = device_suspend(PMSG_SUSPEND);
+ error = device_suspend(PMSG_HIBERNATE);
if (error)
goto Resume_console;
@@ -404,7 +404,7 @@ int hibernation_platform_enter(void)
goto Finish;
local_irq_disable();
- error = device_power_down(PMSG_SUSPEND);
+ error = device_power_down(PMSG_HIBERNATE);
if (!error) {
hibernation_ops->enter();
/* We should never get here */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7c2118f9597f..f1d0b345c9ba 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,22 +75,15 @@ void refrigerator(void)
__set_current_state(save);
}
-static void fake_signal_wake_up(struct task_struct *p, int resume)
+static void fake_signal_wake_up(struct task_struct *p)
{
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, resume);
+ signal_wake_up(p, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
-static void send_fake_signal(struct task_struct *p)
-{
- if (task_is_stopped(p))
- force_sig_specific(SIGSTOP, p);
- fake_signal_wake_up(p, task_is_stopped(p));
-}
-
static int has_mm(struct task_struct *p)
{
return (p->mm && !(p->flags & PF_BORROWED_MM));
@@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
if (freezing(p)) {
if (has_mm(p)) {
if (!signal_pending(p))
- fake_signal_wake_up(p, 0);
+ fake_signal_wake_up(p);
} else {
if (with_mm_only)
ret = 0;
@@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
} else {
if (has_mm(p)) {
set_freeze_flag(p);
- send_fake_signal(p);
+ fake_signal_wake_up(p);
} else {
if (with_mm_only) {
ret = 0;
@@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space)
if (frozen(p) || !freezeable(p))
continue;
- if (task_is_traced(p) && frozen(p->parent)) {
- cancel_freezing(p);
- continue;
- }
-
if (!freeze_task(p, freeze_user_space))
continue;
- if (!freezer_should_skip(p))
+ /*
+ * Now that we've done set_freeze_flag, don't
+ * perturb a task in TASK_STOPPED or TASK_TRACED.
+ * It is "frozen enough". If the task does wake
+ * up, it will immediately call try_to_freeze.
+ */
+ if (!task_is_stopped_or_traced(p) &&
+ !freezer_should_skip(p))
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 95250d7c8d91..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
* of @bm->cur_zone_bm are updated.
*/
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
{
struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = zone_bm->next;
- BUG_ON(!zone_bm);
+ if (!zone_bm)
+ return -EFAULT;
}
bm->cur.zone_bm = zone_bm;
}
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
pfn -= bb->start_pfn;
*bit_nr = pfn % BM_BITS_PER_CHUNK;
*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+ return 0;
}
static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
set_bit(bit, addr);
}
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ if (!error)
+ set_bit(bit, addr);
+ return error;
+}
+
static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
clear_bit(bit, addr);
}
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
return test_bit(bit, addr);
}
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
region->end_pfn << PAGE_SHIFT);
for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn))
- memory_bm_set_bit(bm, pfn);
+ if (pfn_valid(pfn)) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
}
}
@@ -875,8 +902,8 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
#endif /* CONFIG_HIGHMEM */
/**
- * saveable - Determine whether a non-highmem page should be included in
- * the suspend image.
+ * saveable_page - Determine whether a non-highmem page should be included
+ * in the suspend image.
*
* We should save the page if it isn't Nosave, and is not in the range
* of pages statically defined as 'unsaveable', and it isn't a part of
@@ -897,7 +924,8 @@ static struct page *saveable_page(unsigned long pfn)
if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
- if (PageReserved(page) && pfn_is_nosave(pfn))
+ if (PageReserved(page)
+ && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
return page;
@@ -938,6 +966,25 @@ static inline void do_copy_page(long *dst, long *src)
*dst++ = *src++;
}
+
+/**
+ * safe_copy_page - check if the page we are going to copy is marked as
+ * present in the kernel page tables (this always is the case if
+ * CONFIG_DEBUG_PAGEALLOC is not set and in that case
+ * kernel_page_present() always returns 'true').
+ */
+static void safe_copy_page(void *dst, struct page *s_page)
+{
+ if (kernel_page_present(s_page)) {
+ do_copy_page(dst, page_address(s_page));
+ } else {
+ kernel_map_pages(s_page, 1, 1);
+ do_copy_page(dst, page_address(s_page));
+ kernel_map_pages(s_page, 1, 0);
+ }
+}
+
+
#ifdef CONFIG_HIGHMEM
static inline struct page *
page_is_saveable(struct zone *zone, unsigned long pfn)
@@ -946,8 +993,7 @@ page_is_saveable(struct zone *zone, unsigned long pfn)
saveable_highmem_page(pfn) : saveable_page(pfn);
}
-static inline void
-copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
struct page *s_page, *d_page;
void *src, *dst;
@@ -961,29 +1007,26 @@ copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
kunmap_atomic(src, KM_USER0);
kunmap_atomic(dst, KM_USER1);
} else {
- src = page_address(s_page);
if (PageHighMem(d_page)) {
/* Page pointed to by src may contain some kernel
* data modified by kmap_atomic()
*/
- do_copy_page(buffer, src);
+ safe_copy_page(buffer, s_page);
dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0);
memcpy(dst, buffer, PAGE_SIZE);
kunmap_atomic(dst, KM_USER0);
} else {
- dst = page_address(d_page);
- do_copy_page(dst, src);
+ safe_copy_page(page_address(d_page), s_page);
}
}
}
#else
#define page_is_saveable(zone, pfn) saveable_page(pfn)
-static inline void
-copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
+static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
{
- do_copy_page(page_address(pfn_to_page(dst_pfn)),
- page_address(pfn_to_page(src_pfn)));
+ safe_copy_page(page_address(pfn_to_page(dst_pfn)),
+ pfn_to_page(src_pfn));
}
#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/printk.c b/kernel/printk.c
index bee36100f110..bdd4ea8c3f2b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -616,6 +616,53 @@ asmlinkage int printk(const char *fmt, ...)
/* cpu currently holding logbuf_lock */
static volatile unsigned int printk_cpu = UINT_MAX;
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
+ */
+static inline int can_use_console(unsigned int cpu)
+{
+ return cpu_online(cpu) || have_callable_console();
+}
+
+/*
+ * Try to get console ownership to actually show the kernel
+ * messages from a 'printk'. Return true (and with the
+ * console_semaphore held, and 'console_locked' set) if it
+ * is successful, false otherwise.
+ *
+ * This gets called with the 'logbuf_lock' spinlock held and
+ * interrupts disabled. It should return with 'logbuf_lock'
+ * released but interrupts still disabled.
+ */
+static int acquire_console_semaphore_for_printk(unsigned int cpu)
+{
+ int retval = 0;
+
+ if (!try_acquire_console_sem()) {
+ retval = 1;
+
+ /*
+ * If we can't use the console, we need to release
+ * the console semaphore by hand to avoid flushing
+ * the buffer. We need to hold the console semaphore
+ * in order to do this test safely.
+ */
+ if (!can_use_console(cpu)) {
+ console_locked = 0;
+ up(&console_sem);
+ retval = 0;
+ }
+ }
+ printk_cpu = UINT_MAX;
+ spin_unlock(&logbuf_lock);
+ return retval;
+}
+
const char printk_recursion_bug_msg [] =
KERN_CRIT "BUG: recent printk recursion!\n";
static int printk_recursion_bug;
@@ -666,7 +713,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
}
/* Emit the output into the temporary buffer */
printed_len += vscnprintf(printk_buf + printed_len,
- sizeof(printk_buf), fmt, args);
+ sizeof(printk_buf) - printed_len, fmt, args);
/*
* Copy the output into log_buf. If the caller didn't provide
@@ -725,43 +772,22 @@ asmlinkage int vprintk(const char *fmt, va_list args)
log_level_unknown = 1;
}
- if (!down_trylock(&console_sem)) {
- /*
- * We own the drivers. We can drop the spinlock and
- * let release_console_sem() print the text, maybe ...
- */
- console_locked = 1;
- printk_cpu = UINT_MAX;
- spin_unlock(&logbuf_lock);
+ /*
+ * Try to acquire and then immediately release the
+ * console semaphore. The release will do all the
+ * actual magic (print out buffers, wake up klogd,
+ * etc).
+ *
+ * The acquire_console_semaphore_for_printk() function
+ * will release 'logbuf_lock' regardless of whether it
+ * actually gets the semaphore or not.
+ */
+ if (acquire_console_semaphore_for_printk(this_cpu))
+ release_console_sem();
- /*
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
- */
- if (cpu_online(smp_processor_id()) || have_callable_console()) {
- console_may_schedule = 0;
- release_console_sem();
- } else {
- /* Release by hand to avoid flushing the buffer. */
- console_locked = 0;
- up(&console_sem);
- }
- lockdep_on();
- raw_local_irq_restore(flags);
- } else {
- /*
- * Someone else owns the drivers. We drop the spinlock, which
- * allows the semaphore holder to proceed and to call the
- * console drivers with the output which we just produced.
- */
- printk_cpu = UINT_MAX;
- spin_unlock(&logbuf_lock);
- lockdep_on();
+ lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
- }
+ raw_local_irq_restore(flags);
preempt_enable();
return printed_len;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 760dfc233a00..c09605f8d16c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count;
static DEFINE_MUTEX(rcu_barrier_mutex);
static struct completion rcu_barrier_completion;
-/* Because of FASTCALL declaration of complete, we use this wrapper */
+/*
+ * Awaken the corresponding synchronize_rcu() instance now that a
+ * grace period has elapsed.
+ */
static void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7ade89..e9517014b57c 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -23,6 +23,10 @@
* to Suparna Bhattacharya for pushing me completely away
* from atomic instructions on the read side.
*
+ * - Added handling of Dynamic Ticks
+ * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
+ * - Steven Rostedt <srostedt@redhat.com>
+ *
* Papers: http://www.rdrop.com/users/paulmck/RCU
*
* Design Document: http://lwn.net/Articles/253651/
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
}
}
+#ifdef CONFIG_NO_HZ
+
+DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
+static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+static DEFINE_PER_CPU(int, rcu_update_flag);
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * dynticks_progress_counter to let the RCU handling know that the
+ * CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+ int cpu = smp_processor_id();
+
+ if (per_cpu(rcu_update_flag, cpu))
+ per_cpu(rcu_update_flag, cpu)++;
+
+ /*
+ * Only update if we are coming from a stopped ticks mode
+ * (dynticks_progress_counter is even).
+ */
+ if (!in_interrupt() &&
+ (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+ /*
+ * The following might seem like we could have a race
+ * with NMI/SMIs. But this really isn't a problem.
+ * Here we do a read/modify/write, and the race happens
+ * when an NMI/SMI comes in after the read and before
+ * the write. But NMI/SMIs will increment this counter
+ * twice before returning, so the zero bit will not
+ * be corrupted by the NMI/SMI which is the most important
+ * part.
+ *
+ * The only thing is that we would bring back the counter
+ * to a position that it was in during the NMI/SMI.
+ * But the zero bit would be set, so the rest of the
+ * counter would again be ignored.
+ *
+ * On return from the IRQ, the counter may have the zero
+ * bit be 0 and the counter the same as the return from
+ * the NMI/SMI. If the state machine was so unlucky to
+ * see that, it still doesn't matter, since all
+ * RCU read-side critical sections on this CPU would
+ * have already completed.
+ */
+ per_cpu(dynticks_progress_counter, cpu)++;
+ /*
+ * The following memory barrier ensures that any
+ * rcu_read_lock() primitives in the irq handler
+ * are seen by other CPUs to follow the above
+ * increment to dynticks_progress_counter. This is
+ * required in order for other CPUs to correctly
+ * determine when it is safe to advance the RCU
+ * grace-period state machine.
+ */
+ smp_mb(); /* see above block comment. */
+ /*
+ * Since we can't determine the dynamic tick mode from
+ * the dynticks_progress_counter after this routine,
+ * we use a second flag to acknowledge that we came
+ * from an idle state with ticks stopped.
+ */
+ per_cpu(rcu_update_flag, cpu)++;
+ /*
+ * If we take an NMI/SMI now, they will also increment
+ * the rcu_update_flag, and will not update the
+ * dynticks_progress_counter on exit. That is for
+ * this IRQ to do.
+ */
+ }
+}
+
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update the
+ * dynticks_progress_counter to let the RCU handling be
+ * aware that the CPU is going back to idle with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * rcu_update_flag is set if we interrupted the CPU
+ * when it was idle with ticks stopped.
+ * Once this occurs, we keep track of interrupt nesting
+ * because a NMI/SMI could also come in, and we still
+ * only want the IRQ that started the increment of the
+ * dynticks_progress_counter to be the one that modifies
+ * it on exit.
+ */
+ if (per_cpu(rcu_update_flag, cpu)) {
+ if (--per_cpu(rcu_update_flag, cpu))
+ return;
+
+ /* This must match the interrupt nesting */
+ WARN_ON(in_interrupt());
+
+ /*
+ * If an NMI/SMI happens now we are still
+ * protected by the dynticks_progress_counter being odd.
+ */
+
+ /*
+ * The following memory barrier ensures that any
+ * rcu_read_unlock() primitives in the irq handler
+ * are seen by other CPUs to precede the following
+ * increment to dynticks_progress_counter. This
+ * is required in order for other CPUs to determine
+ * when it is safe to advance the RCU grace-period
+ * state machine.
+ */
+ smp_mb(); /* see above block comment. */
+ per_cpu(dynticks_progress_counter, cpu)++;
+ WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+ }
+}
+
+static void dyntick_save_progress_counter(int cpu)
+{
+ per_cpu(rcu_dyntick_snapshot, cpu) =
+ per_cpu(dynticks_progress_counter, cpu);
+}
+
+static inline int
+rcu_try_flip_waitack_needed(int cpu)
+{
+ long curr;
+ long snap;
+
+ curr = per_cpu(dynticks_progress_counter, cpu);
+ snap = per_cpu(rcu_dyntick_snapshot, cpu);
+ smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+ /*
+ * If the CPU remained in dynticks mode for the entire time
+ * and didn't take any interrupts, NMIs, SMIs, or whatever,
+ * then it cannot be in the middle of an rcu_read_lock(), so
+ * the next rcu_read_lock() it executes must use the new value
+ * of the counter. So we can safely pretend that this CPU
+ * already acknowledged the counter.
+ */
+
+ if ((curr == snap) && ((curr & 0x1) == 0))
+ return 0;
+
+ /*
+ * If the CPU passed through or entered a dynticks idle phase with
+ * no active irq handlers, then, as above, we can safely pretend
+ * that this CPU already acknowledged the counter.
+ */
+
+ if ((curr - snap) > 2 || (snap & 0x1) == 0)
+ return 0;
+
+ /* We need this CPU to explicitly acknowledge the counter flip. */
+
+ return 1;
+}
+
+static inline int
+rcu_try_flip_waitmb_needed(int cpu)
+{
+ long curr;
+ long snap;
+
+ curr = per_cpu(dynticks_progress_counter, cpu);
+ snap = per_cpu(rcu_dyntick_snapshot, cpu);
+ smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+ /*
+ * If the CPU remained in dynticks mode for the entire time
+ * and didn't take any interrupts, NMIs, SMIs, or whatever,
+ * then it cannot have executed an RCU read-side critical section
+ * during that time, so there is no need for it to execute a
+ * memory barrier.
+ */
+
+ if ((curr == snap) && ((curr & 0x1) == 0))
+ return 0;
+
+ /*
+ * If the CPU either entered or exited an outermost interrupt,
+ * SMI, NMI, or whatever handler, then we know that it executed
+ * a memory barrier when doing so. So we don't need another one.
+ */
+ if (curr != snap)
+ return 0;
+
+ /* We need the CPU to execute a memory barrier. */
+
+ return 1;
+}
+
+#else /* !CONFIG_NO_HZ */
+
+# define dyntick_save_progress_counter(cpu) do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu) (1)
+# define rcu_try_flip_waitmb_needed(cpu) (1)
+
+#endif /* CONFIG_NO_HZ */
+
/*
* Get here when RCU is idle. Decide whether we need to
* move out of idle state, and return non-zero if so.
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void)
/* Now ask each CPU for acknowledgement of the flip. */
- for_each_cpu_mask(cpu, rcu_cpu_online_map)
+ for_each_cpu_mask(cpu, rcu_cpu_online_map) {
per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+ dyntick_save_progress_counter(cpu);
+ }
return 1;
}
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void)
RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
- if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+ if (rcu_try_flip_waitack_needed(cpu) &&
+ per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
return 0;
}
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void)
smp_mb(); /* ^^^^^^^^^^^^ */
/* Call for a memory barrier from each CPU. */
- for_each_cpu_mask(cpu, rcu_cpu_online_map)
+ for_each_cpu_mask(cpu, rcu_cpu_online_map) {
per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+ dyntick_save_progress_counter(cpu);
+ }
RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
return 1;
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void)
RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
for_each_cpu_mask(cpu, rcu_cpu_online_map)
- if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+ if (rcu_try_flip_waitmb_needed(cpu) &&
+ per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
return 0;
}
@@ -702,8 +918,9 @@ void rcu_offline_cpu(int cpu)
* fix.
*/
+ local_irq_save(flags);
rdp = RCU_DATA_ME();
- spin_lock_irqsave(&rdp->lock, flags);
+ spin_lock(&rdp->lock);
*rdp->nexttail = list;
if (list)
rdp->nexttail = tail;
@@ -735,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused)
{
unsigned long flags;
struct rcu_head *next, *list;
- struct rcu_data *rdp = RCU_DATA_ME();
+ struct rcu_data *rdp;
- spin_lock_irqsave(&rdp->lock, flags);
+ local_irq_save(flags);
+ rdp = RCU_DATA_ME();
+ spin_lock(&rdp->lock);
list = rdp->donelist;
if (list == NULL) {
spin_unlock_irqrestore(&rdp->lock, flags);
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a7..d6204a485818 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp)
kref_get(&buf->kref);
filp->private_data = buf;
- return 0;
+ return nonseekable_open(inode, filp);
}
/**
@@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
+static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
/*
* subbuf_splice_actor - splice up to one subbuf's worth of data
*/
@@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in,
unsigned int flags,
int *nonpad_ret)
{
- unsigned int pidx, poff, total_len, subbuf_pages, ret;
+ unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
struct rchan_buf *rbuf = in->private_data;
unsigned int subbuf_size = rbuf->chan->subbuf_size;
uint64_t pos = (uint64_t) *ppos;
@@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in,
.partial = partial,
.flags = flags,
.ops = &relay_pipe_buf_ops,
+ .spd_release = relay_page_release,
};
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
@@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
+ nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
- for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+ for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
unsigned int cur_pos = read_start + total_len;
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 16cbec2d5d60..efbfc0fc232f 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
ret = -EINVAL;
+ strstrip(buf);
if (write_strategy) {
if (write_strategy(buf, &tmp)) {
goto out_free;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 0deef71ff8d2..6522ae5b14a2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(state);
/* Setup the timer, when timeout != NULL */
- if (unlikely(timeout))
+ if (unlikely(timeout)) {
hrtimer_start(&timeout->timer, timeout->timer.expires,
HRTIMER_MODE_ABS);
+ if (!hrtimer_active(&timeout->timer))
+ timeout->task = NULL;
+ }
for (;;) {
/* Try to acquire the lock: */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3eedd5260907..8dcdec6fe0fe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -155,7 +155,7 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
#include <linux/cgroup.h>
@@ -165,118 +165,88 @@ static LIST_HEAD(task_groups);
/* task group related information */
struct task_group {
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
struct cgroup_subsys_state css;
#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
+ unsigned long shares;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
- unsigned int rt_ratio;
-
- /*
- * shares assigned to a task group governs how much of cpu bandwidth
- * is allocated to the group. The more shares a group has, the more is
- * the cpu bandwidth allocated to it.
- *
- * For ex, lets say that there are three task groups, A, B and C which
- * have been assigned shares 1000, 2000 and 3000 respectively. Then,
- * cpu bandwidth allocated by the scheduler to task groups A, B and C
- * should be:
- *
- * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
- * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
- * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
- *
- * The weight assigned to a task group's schedulable entities on every
- * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
- * group's shares. For ex: lets say that task group A has been
- * assigned shares of 1000 and there are two CPUs in a system. Then,
- *
- * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
- *
- * Note: It's not necessary that each of a task's group schedulable
- * entity have the same weight on all CPUs. If the group
- * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
- * better distribution of weight could be:
- *
- * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
- * tg_A->se[1]->load.weight = 1/2 * 2000 = 667
- *
- * rebalance_shares() is responsible for distributing the shares of a
- * task groups like this among the group's schedulable entities across
- * cpus.
- *
- */
- unsigned long shares;
+ u64 rt_runtime;
+#endif
struct rcu_head rcu;
struct list_head list;
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
-
static struct sched_entity *init_sched_entity_p[NR_CPUS];
static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
static struct rt_rq *init_rt_rq_p[NR_CPUS];
+#endif
-/* task_group_mutex serializes add/remove of task groups and also changes to
+/* task_group_lock serializes add/remove of task groups and also changes to
* a task group's cpu shares.
*/
-static DEFINE_MUTEX(task_group_mutex);
+static DEFINE_SPINLOCK(task_group_lock);
/* doms_cur_mutex serializes access to doms_cur[] array */
static DEFINE_MUTEX(doms_cur_mutex);
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_USER_SCHED
+# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
+#else
+# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+#endif
/* Default task group.
* Every task in system belong to this group at bootup.
*/
struct task_group init_task_group = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
.se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
.rt_se = init_sched_rt_entity_p,
.rt_rq = init_rt_rq_p,
-};
-
-#ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
-#else
-# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
-
-#define MIN_GROUP_SHARES 2
-
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+};
/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
struct task_group *tg;
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
tg = p->user->tg;
-#elif defined(CONFIG_FAIR_CGROUP_SCHED)
+#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
#else
@@ -288,21 +258,15 @@ static inline struct task_group *task_group(struct task_struct *p)
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
+#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
-}
-
-static inline void lock_task_group_list(void)
-{
- mutex_lock(&task_group_mutex);
-}
-
-static inline void unlock_task_group_list(void)
-{
- mutex_unlock(&task_group_mutex);
+#endif
}
static inline void lock_doms_cur(void)
@@ -318,12 +282,10 @@ static inline void unlock_doms_cur(void)
#else
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline void lock_task_group_list(void) { }
-static inline void unlock_task_group_list(void) { }
static inline void lock_doms_cur(void) { }
static inline void unlock_doms_cur(void) { }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_GROUP_SCHED */
/* CFS-related fields in a runqueue */
struct cfs_rq {
@@ -339,7 +301,7 @@ struct cfs_rq {
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr;
+ struct sched_entity *curr, *next;
unsigned long nr_spread_over;
@@ -363,7 +325,7 @@ struct cfs_rq {
struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
int highest_prio; /* highest queued rt task prio */
#endif
#ifdef CONFIG_SMP
@@ -373,7 +335,9 @@ struct rt_rq {
int rt_throttled;
u64 rt_time;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
@@ -447,6 +411,8 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
@@ -628,18 +594,14 @@ enum {
SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
SCHED_FEAT_WAKEUP_PREEMPT = 2,
SCHED_FEAT_START_DEBIT = 4,
- SCHED_FEAT_TREE_AVG = 8,
- SCHED_FEAT_APPROX_AVG = 16,
- SCHED_FEAT_HRTICK = 32,
- SCHED_FEAT_DOUBLE_TICK = 64,
+ SCHED_FEAT_HRTICK = 8,
+ SCHED_FEAT_DOUBLE_TICK = 16,
};
const_debug unsigned int sysctl_sched_features =
SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
SCHED_FEAT_WAKEUP_PREEMPT * 1 |
SCHED_FEAT_START_DEBIT * 1 |
- SCHED_FEAT_TREE_AVG * 0 |
- SCHED_FEAT_APPROX_AVG * 0 |
SCHED_FEAT_HRTICK * 1 |
SCHED_FEAT_DOUBLE_TICK * 0;
@@ -652,19 +614,23 @@ const_debug unsigned int sysctl_sched_features =
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
* default: 1s
*/
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+unsigned int sysctl_sched_rt_period = 1000000;
-#define SCHED_RT_FRAC_SHIFT 16
-#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
+static __read_mostly int scheduler_running;
/*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
*/
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+int sysctl_sched_rt_runtime = 950000;
+
+/*
+ * single value that denotes runtime == period, i.e. unlimited time.
+ */
+#define RUNTIME_INF ((u64)~0ULL)
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -676,14 +642,16 @@ unsigned long long cpu_clock(int cpu)
unsigned long flags;
struct rq *rq;
- local_irq_save(flags);
- rq = cpu_rq(cpu);
/*
* Only call sched_clock() if the scheduler has already been
* initialized (some code might call cpu_clock() very early):
*/
- if (rq->idle)
- update_rq_clock(rq);
+ if (unlikely(!scheduler_running))
+ return 0;
+
+ local_irq_save(flags);
+ rq = cpu_rq(cpu);
+ update_rq_clock(rq);
now = rq->clock;
local_irq_restore(flags);
@@ -1084,6 +1052,49 @@ static void resched_cpu(int cpu)
resched_task(cpu_curr(cpu));
spin_unlock_irqrestore(&rq->lock, flags);
}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (cpu == smp_processor_id())
+ return;
+
+ /*
+ * This is safe, as this function is called with the timer
+ * wheel base lock of (cpu) held. When the CPU is on the way
+ * to idle and has not yet set rq->curr to idle then it will
+ * be serialized on the timer wheel base lock and take the new
+ * timer into account automatically.
+ */
+ if (rq->curr != rq->idle)
+ return;
+
+ /*
+ * We can set TIF_NEED_RESCHED on the idle task of the other
+ * CPU locklessly. The worst case is that the other CPU runs
+ * the idle task through an additional no-op schedule().
+ */
+ set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(rq->idle))
+ smp_send_reschedule(cpu);
+}
+#endif
+
#else
static void __resched_task(struct task_struct *p, int tif_bit)
{
@@ -1112,7 +1123,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
u64 tmp;
if (unlikely(!lw->inv_weight))
- lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+ lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
tmp = (u64)delta_exec * weight;
/*
@@ -1136,11 +1147,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
+ lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
+ lw->inv_weight = 0;
}
/*
@@ -1228,16 +1241,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
- update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
- update_load_sub(&rq->load, load);
-}
-
#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
@@ -1255,14 +1258,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
#define sched_class_highest (&rt_sched_class)
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
+ inc_load(rq, p);
}
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
+ dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
@@ -1354,7 +1369,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
/*
@@ -1366,7 +1381,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(rq);
+ dec_nr_running(p, rq);
}
/**
@@ -1420,6 +1435,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
s64 delta;
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (&p->se == cfs_rq_of(&p->se)->next)
+ return 1;
+
if (p->sched_class != &fair_sched_class)
return 0;
@@ -1818,6 +1839,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
long old_state;
struct rq *rq;
+ smp_wmb();
rq = task_rq_lock(p, &flags);
old_state = p->state;
if (!(old_state & state))
@@ -1878,10 +1900,11 @@ out_activate:
schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
- check_preempt_curr(rq, p);
success = 1;
out_running:
+ check_preempt_curr(rq, p);
+
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -1915,6 +1938,8 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.last_wakeup = 0;
+ p->se.avg_overlap = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
@@ -2005,7 +2030,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
@@ -3753,7 +3778,7 @@ void scheduler_tick(void)
#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
-void add_preempt_count(int val)
+void __kprobes add_preempt_count(int val)
{
/*
* Underflow?
@@ -3769,7 +3794,7 @@ void add_preempt_count(int val)
}
EXPORT_SYMBOL(add_preempt_count);
-void sub_preempt_count(int val)
+void __kprobes sub_preempt_count(int val)
{
/*
* Underflow?
@@ -3871,7 +3896,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
asmlinkage void __sched schedule(void)
{
struct task_struct *prev, *next;
- long *switch_count;
+ unsigned long *switch_count;
struct rq *rq;
int cpu;
@@ -3900,7 +3925,7 @@ need_resched_nonpreemptible:
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
- unlikely(signal_pending(prev)))) {
+ signal_pending(prev))) {
prev->state = TASK_RUNNING;
} else {
deactivate_task(rq, prev, 1);
@@ -4293,11 +4318,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
oldprio = p->prio;
on_rq = p->se.on_rq;
running = task_current(rq, p);
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
- }
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@@ -4306,10 +4330,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
+ if (running)
+ p->sched_class->set_curr_task(rq);
if (on_rq) {
- if (running)
- p->sched_class->set_curr_task(rq);
-
enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4344,8 +4367,10 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq)
+ if (on_rq) {
dequeue_task(rq, p, 0);
+ dec_load(rq, p);
+ }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -4355,6 +4380,7 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) {
enqueue_task(rq, p, 0);
+ inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -4444,7 +4470,7 @@ int task_nice(const struct task_struct *p)
{
return TASK_NICE(p);
}
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
/**
* idle_cpu - is a given cpu idle currently?
@@ -4571,6 +4597,15 @@ recheck:
return -EPERM;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow realtime tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_policy(policy) && task_group(p)->rt_runtime == 0)
+ return -EPERM;
+#endif
+
retval = security_task_setscheduler(p, policy, param);
if (retval)
return retval;
@@ -4594,19 +4629,17 @@ recheck:
update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
- if (on_rq) {
+ if (on_rq)
deactivate_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
- }
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
+ if (running)
+ p->sched_class->set_curr_task(rq);
if (on_rq) {
- if (running)
- p->sched_class->set_curr_task(rq);
-
activate_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5113,7 +5146,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
time_slice = 0;
if (p->policy == SCHED_RR) {
time_slice = DEF_TIMESLICE;
- } else {
+ } else if (p->policy != SCHED_FIFO) {
struct sched_entity *se = &p->se;
unsigned long flags;
struct rq *rq;
@@ -5894,7 +5927,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_unlock_irq(&rq->lock);
break;
- case CPU_DOWN_PREPARE:
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
@@ -6816,6 +6850,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */
*/
static cpumask_t fallback_doms;
+void __attribute__((weak)) arch_update_cpu_topology(void)
+{
+}
+
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
* For now this just excludes isolated cpus, but could be used to
@@ -6825,6 +6863,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
{
int err;
+ arch_update_cpu_topology();
ndoms_cur = 1;
doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!doms_cur)
@@ -6929,7 +6968,7 @@ match2:
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static int arch_reinit_sched_domains(void)
+int arch_reinit_sched_domains(void)
{
int err;
@@ -7060,21 +7099,6 @@ void __init sched_init_smp(void)
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (nr_cpu_ids == 1)
- return;
-
- lb_monitor_task = kthread_create(load_balance_monitor, NULL,
- "group_balance");
- if (!IS_ERR(lb_monitor_task)) {
- lb_monitor_task->flags |= PF_NOFREEZE;
- wake_up_process(lb_monitor_task);
- } else {
- printk(KERN_ERR "Could not create load balance monitor thread"
- "(error = %ld) \n", PTR_ERR(lb_monitor_task));
- }
-#endif
}
#else
void __init sched_init_smp(void)
@@ -7112,7 +7136,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
rt_rq->highest_prio = MAX_RT_PRIO;
#endif
#ifdef CONFIG_SMP
@@ -7123,7 +7147,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
rt_rq->rt_time = 0;
rt_rq->rt_throttled = 0;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
#endif
}
@@ -7146,7 +7171,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
se->parent = NULL;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
int cpu, int add)
@@ -7175,7 +7202,7 @@ void __init sched_init(void)
init_defrootdomain();
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
list_add(&init_task_group.list, &task_groups);
#endif
@@ -7196,7 +7223,10 @@ void __init sched_init(void)
&per_cpu(init_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1);
- init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ init_task_group.rt_runtime =
+ sysctl_sched_rt_runtime * NSEC_PER_USEC;
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(rq, &init_task_group,
&per_cpu(init_rt_rq, i),
@@ -7255,6 +7285,8 @@ void __init sched_init(void)
* During early bootup we pretend to be a normal task:
*/
current->sched_class = &fair_sched_class;
+
+ scheduler_running = 1;
}
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -7303,7 +7335,7 @@ void normalize_rt_tasks(void)
unsigned long flags;
struct rq *rq;
- read_lock_irq(&tasklist_lock);
+ read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
/*
* Only normalize user tasks:
@@ -7329,16 +7361,16 @@ void normalize_rt_tasks(void)
continue;
}
- spin_lock_irqsave(&p->pi_lock, flags);
+ spin_lock(&p->pi_lock);
rq = __task_rq_lock(p);
normalize_task(rq, p);
__task_rq_unlock(rq);
- spin_unlock_irqrestore(&p->pi_lock, flags);
+ spin_unlock(&p->pi_lock);
} while_each_thread(g, p);
- read_unlock_irq(&tasklist_lock);
+ read_unlock_irqrestore(&tasklist_lock, flags);
}
#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7387,202 +7419,114 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED
-#ifdef CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void free_fair_sched_group(struct task_group *tg)
{
- struct cfs_rq *cfs_rq;
- struct rq *rq = cpu_rq(this_cpu);
- cpumask_t sdspan = sd->span;
- int balanced = 1;
-
- /* Walk thr' all the task groups that we have */
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- int i;
- unsigned long total_load = 0, total_shares;
- struct task_group *tg = cfs_rq->tg;
-
- /* Gather total task load of this group across cpus */
- for_each_cpu_mask(i, sdspan)
- total_load += tg->cfs_rq[i]->load.weight;
-
- /* Nothing to do if this group has no load */
- if (!total_load)
- continue;
-
- /*
- * tg->shares represents the number of cpu shares the task group
- * is eligible to hold on a single cpu. On N cpus, it is
- * eligible to hold (N * tg->shares) number of cpu shares.
- */
- total_shares = tg->shares * cpus_weight(sdspan);
-
- /*
- * redistribute total_shares across cpus as per the task load
- * distribution.
- */
- for_each_cpu_mask(i, sdspan) {
- unsigned long local_load, local_shares;
-
- local_load = tg->cfs_rq[i]->load.weight;
- local_shares = (local_load * total_shares) / total_load;
- if (!local_shares)
- local_shares = MIN_GROUP_SHARES;
- if (local_shares == tg->se[i]->load.weight)
- continue;
+ int i;
- spin_lock_irq(&cpu_rq(i)->lock);
- set_se_shares(tg->se[i], local_shares);
- spin_unlock_irq(&cpu_rq(i)->lock);
- balanced = 0;
- }
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
}
- return balanced;
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
}
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
+static int alloc_fair_sched_group(struct task_group *tg)
{
- unsigned int timeout = sysctl_sched_min_bal_int_shares;
- struct sched_param schedparm;
- int ret;
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
- /*
- * We don't want this thread's execution to be limited by the shares
- * assigned to default group (init_task_group). Hence make it run
- * as a SCHED_RR RT task at the lowest priority.
- */
- schedparm.sched_priority = 1;
- ret = sched_setscheduler(current, SCHED_RR, &schedparm);
- if (ret)
- printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
- " monitor thread (error = %d) \n", ret);
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
+ if (!tg->se)
+ goto err;
- while (!kthread_should_stop()) {
- int i, cpu, balanced = 1;
+ tg->shares = NICE_0_LOAD;
- /* Prevent cpus going down or coming up */
- get_online_cpus();
- /* lockout changes to doms_cur[] array */
- lock_doms_cur();
- /*
- * Enter a rcu read-side critical section to safely walk rq->sd
- * chain on various cpus and to walk task group list
- * (rq->leaf_cfs_rq_list) in rebalance_shares().
- */
- rcu_read_lock();
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
- for (i = 0; i < ndoms_cur; i++) {
- cpumask_t cpumap = doms_cur[i];
- struct sched_domain *sd = NULL, *sd_prev = NULL;
+ cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
- cpu = first_cpu(cpumap);
+ se = kmalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+ if (!se)
+ goto err;
- /* Find the highest domain at which to balance shares */
- for_each_domain(cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
- sd_prev = sd;
- }
+ init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+ }
- sd = sd_prev;
- /* sd == NULL? No load balance reqd in this domain */
- if (!sd)
- continue;
+ return 1;
- balanced &= rebalance_shares(sd, cpu);
- }
+ err:
+ return 0;
+}
- rcu_read_unlock();
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+ list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
+ &cpu_rq(cpu)->leaf_cfs_rq_list);
+}
- unlock_doms_cur();
- put_online_cpus();
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+ list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+}
+#else
+static inline void free_fair_sched_group(struct task_group *tg)
+{
+}
- if (!balanced)
- timeout = sysctl_sched_min_bal_int_shares;
- else if (timeout < sysctl_sched_max_bal_int_shares)
- timeout *= 2;
+static inline int alloc_fair_sched_group(struct task_group *tg)
+{
+ return 1;
+}
- msleep_interruptible(timeout);
- }
+static inline void register_fair_sched_group(struct task_group *tg, int cpu)
+{
+}
- return 0;
+static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
}
-#endif /* CONFIG_SMP */
+#endif
-static void free_sched_group(struct task_group *tg)
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg)
{
int i;
for_each_possible_cpu(i) {
- if (tg->cfs_rq)
- kfree(tg->cfs_rq[i]);
- if (tg->se)
- kfree(tg->se[i]);
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
if (tg->rt_se)
kfree(tg->rt_se[i]);
}
- kfree(tg->cfs_rq);
- kfree(tg->se);
kfree(tg->rt_rq);
kfree(tg->rt_se);
- kfree(tg);
}
-/* allocate runqueue etc for a new task group */
-struct task_group *sched_create_group(void)
+static int alloc_rt_sched_group(struct task_group *tg)
{
- struct task_group *tg;
- struct cfs_rq *cfs_rq;
- struct sched_entity *se;
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
struct rq *rq;
int i;
- tg = kzalloc(sizeof(*tg), GFP_KERNEL);
- if (!tg)
- return ERR_PTR(-ENOMEM);
-
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
- if (!tg->cfs_rq)
- goto err;
- tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
- if (!tg->se)
- goto err;
tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
if (!tg->rt_rq)
goto err;
@@ -7590,22 +7534,11 @@ struct task_group *sched_create_group(void)
if (!tg->rt_se)
goto err;
- tg->shares = NICE_0_LOAD;
- tg->rt_ratio = 0; /* XXX */
+ tg->rt_runtime = 0;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
- if (!cfs_rq)
- goto err;
-
- se = kmalloc_node(sizeof(struct sched_entity),
- GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
- if (!se)
- goto err;
-
rt_rq = kmalloc_node(sizeof(struct rt_rq),
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
if (!rt_rq)
@@ -7616,20 +7549,75 @@ struct task_group *sched_create_group(void)
if (!rt_se)
goto err;
- init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
}
- lock_task_group_list();
+ return 1;
+
+ err:
+ return 0;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+ list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
+ &cpu_rq(cpu)->leaf_rt_rq_list);
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+ list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
+}
+#else
+static inline void free_rt_sched_group(struct task_group *tg)
+{
+}
+
+static inline int alloc_rt_sched_group(struct task_group *tg)
+{
+ return 1;
+}
+
+static inline void register_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+
+static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
+{
+}
+#endif
+
+static void free_sched_group(struct task_group *tg)
+{
+ free_fair_sched_group(tg);
+ free_rt_sched_group(tg);
+ kfree(tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(void)
+{
+ struct task_group *tg;
+ unsigned long flags;
+ int i;
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ if (!alloc_fair_sched_group(tg))
+ goto err;
+
+ if (!alloc_rt_sched_group(tg))
+ goto err;
+
+ spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
- cfs_rq = tg->cfs_rq[i];
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
- rt_rq = tg->rt_rq[i];
- list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+ register_fair_sched_group(tg, i);
+ register_rt_sched_group(tg, i);
}
list_add_rcu(&tg->list, &task_groups);
- unlock_task_group_list();
+ spin_unlock_irqrestore(&task_group_lock, flags);
return tg;
@@ -7648,21 +7636,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
- struct cfs_rq *cfs_rq = NULL;
- struct rt_rq *rt_rq = NULL;
+ unsigned long flags;
int i;
- lock_task_group_list();
+ spin_lock_irqsave(&task_group_lock, flags);
for_each_possible_cpu(i) {
- cfs_rq = tg->cfs_rq[i];
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- rt_rq = tg->rt_rq[i];
- list_del_rcu(&rt_rq->leaf_rt_rq_list);
+ unregister_fair_sched_group(tg, i);
+ unregister_rt_sched_group(tg, i);
}
list_del_rcu(&tg->list);
- unlock_task_group_list();
-
- BUG_ON(!cfs_rq);
+ spin_unlock_irqrestore(&task_group_lock, flags);
/* wait for possible concurrent references to cfs_rqs complete */
call_rcu(&tg->rcu, free_sched_group_rcu);
@@ -7686,70 +7669,71 @@ void sched_move_task(struct task_struct *tsk)
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, tsk, 0);
- if (unlikely(running))
- tsk->sched_class->put_prev_task(rq, tsk);
- }
+ if (unlikely(running))
+ tsk->sched_class->put_prev_task(rq, tsk);
set_task_rq(tsk, task_cpu(tsk));
- if (on_rq) {
- if (unlikely(running))
- tsk->sched_class->set_curr_task(rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->moved_group)
+ tsk->sched_class->moved_group(tsk);
+#endif
+
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ if (on_rq)
enqueue_task(rq, tsk, 0);
- }
task_rq_unlock(rq, &flags);
}
-/* rq->lock to be locked by caller */
+#ifdef CONFIG_FAIR_GROUP_SCHED
static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
struct rq *rq = cfs_rq->rq;
int on_rq;
- if (!shares)
- shares = MIN_GROUP_SHARES;
+ spin_lock_irq(&rq->lock);
on_rq = se->on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_entity(cfs_rq, se, 0);
- dec_cpu_load(rq, se->load.weight);
- }
se->load.weight = shares;
se->load.inv_weight = div64_64((1ULL<<32), shares);
- if (on_rq) {
+ if (on_rq)
enqueue_entity(cfs_rq, se, 0);
- inc_cpu_load(rq, se->load.weight);
- }
+
+ spin_unlock_irq(&rq->lock);
}
+static DEFINE_MUTEX(shares_mutex);
+
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
- struct cfs_rq *cfs_rq;
- struct rq *rq;
+ unsigned long flags;
+
+ /*
+ * A weight of 0 or 1 can cause arithmetic problems.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+ if (shares < 2)
+ shares = 2;
- lock_task_group_list();
+ mutex_lock(&shares_mutex);
if (tg->shares == shares)
goto done;
- if (shares < MIN_GROUP_SHARES)
- shares = MIN_GROUP_SHARES;
-
- /*
- * Prevent any load balance activity (rebalance_shares,
- * load_balance_fair) from referring to this group first,
- * by taking it off the rq->leaf_cfs_rq_list on each cpu.
- */
- for_each_possible_cpu(i) {
- cfs_rq = tg->cfs_rq[i];
- list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
- }
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i)
+ unregister_fair_sched_group(tg, i);
+ spin_unlock_irqrestore(&task_group_lock, flags);
/* wait for any ongoing reference to this group to finish */
synchronize_sched();
@@ -7759,23 +7743,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i) {
- spin_lock_irq(&cpu_rq(i)->lock);
+ for_each_possible_cpu(i)
set_se_shares(tg->se[i], shares);
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
/*
* Enable load balance activity on this group, by inserting it back on
* each cpu's rq->leaf_cfs_rq_list.
*/
- for_each_possible_cpu(i) {
- rq = cpu_rq(i);
- cfs_rq = tg->cfs_rq[i];
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
- }
+ spin_lock_irqsave(&task_group_lock, flags);
+ for_each_possible_cpu(i)
+ register_fair_sched_group(tg, i);
+ spin_unlock_irqrestore(&task_group_lock, flags);
done:
- unlock_task_group_list();
+ mutex_unlock(&shares_mutex);
return 0;
}
@@ -7783,35 +7763,97 @@ unsigned long sched_group_shares(struct task_group *tg)
{
return tg->shares;
}
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
/*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
*/
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+ if (runtime == RUNTIME_INF)
+ return 1ULL << 16;
+
+ return div64_64(runtime << 16, period);
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
struct task_group *tgi;
unsigned long total = 0;
+ unsigned long global_ratio =
+ to_ratio(sysctl_sched_rt_period,
+ sysctl_sched_rt_runtime < 0 ?
+ RUNTIME_INF : sysctl_sched_rt_runtime);
rcu_read_lock();
- list_for_each_entry_rcu(tgi, &task_groups, list)
- total += tgi->rt_ratio;
+ list_for_each_entry_rcu(tgi, &task_groups, list) {
+ if (tgi == tg)
+ continue;
+
+ total += to_ratio(period, tgi->rt_runtime);
+ }
rcu_read_unlock();
- if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
- return -EINVAL;
+ return total + to_ratio(period, runtime) < global_ratio;
+}
- tg->rt_ratio = rt_ratio;
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+ struct task_struct *g, *p;
+ do_each_thread(g, p) {
+ if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ return 1;
+ } while_each_thread(g, p);
return 0;
}
-unsigned long sched_group_rt_ratio(struct task_group *tg)
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
- return tg->rt_ratio;
+ u64 rt_runtime, rt_period;
+ int err = 0;
+
+ rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+ rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+ if (rt_runtime_us == -1)
+ rt_runtime = RUNTIME_INF;
+
+ mutex_lock(&rt_constraints_mutex);
+ read_lock(&tasklist_lock);
+ if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+ err = -EBUSY;
+ goto unlock;
+ }
+ if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+ tg->rt_runtime = rt_runtime;
+ unlock:
+ read_unlock(&tasklist_lock);
+ mutex_unlock(&rt_constraints_mutex);
+
+ return err;
}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+long sched_group_rt_runtime(struct task_group *tg)
+{
+ u64 rt_runtime_us;
+
+ if (tg->rt_runtime == RUNTIME_INF)
+ return -1;
+
+ rt_runtime_us = tg->rt_runtime;
+ do_div(rt_runtime_us, NSEC_PER_USEC);
+ return rt_runtime_us;
+}
+#endif
+#endif /* CONFIG_GROUP_SCHED */
-#ifdef CONFIG_FAIR_CGROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
/* return corresponding task_group object of a cgroup */
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
@@ -7857,9 +7899,15 @@ static int
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
struct task_struct *tsk)
{
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Don't accept realtime tasks when there is no way for them to run */
+ if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0)
+ return -EINVAL;
+#else
/* We don't support RT-tasks being in separate groups */
if (tsk->sched_class != &fair_sched_class)
return -EINVAL;
+#endif
return 0;
}
@@ -7871,6 +7919,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
sched_move_task(tsk);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
@@ -7883,31 +7932,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
return (u64) tg->shares;
}
+#endif
-static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
- u64 rt_ratio_val)
+#ifdef CONFIG_RT_GROUP_SCHED
+static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *unused_ppos)
{
- return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+ char buffer[64];
+ int retval = 0;
+ s64 val;
+ char *end;
+
+ if (!nbytes)
+ return -EINVAL;
+ if (nbytes >= sizeof(buffer))
+ return -E2BIG;
+ if (copy_from_user(buffer, userbuf, nbytes))
+ return -EFAULT;
+
+ buffer[nbytes] = 0; /* nul-terminate; nbytes < sizeof(buffer) here */
+
+ /* strip a single trailing newline before parsing the number */
+ if (nbytes && (buffer[nbytes-1] == '\n'))
+ buffer[nbytes-1] = 0;
+ val = simple_strtoll(buffer, &end, 0);
+ if (*end)
+ return -EINVAL;
+
+ /* Pass to the scheduler; on success report all nbytes as consumed */
+ retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+ if (!retval)
+ retval = nbytes;
+ return retval;
}
-static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes,
+ loff_t *ppos)
{
- struct task_group *tg = cgroup_tg(cgrp);
+ char tmp[64];
+ long val = sched_group_rt_runtime(cgroup_tg(cgrp));
+ int len = sprintf(tmp, "%ld\n", val);
- return (u64) tg->rt_ratio;
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
+#endif
static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
.read_uint = cpu_shares_read_uint,
.write_uint = cpu_shares_write_uint,
},
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
{
- .name = "rt_ratio",
- .read_uint = cpu_rt_ratio_read_uint,
- .write_uint = cpu_rt_ratio_write_uint,
+ .name = "rt_runtime_us",
+ .read = cpu_rt_runtime_read,
+ .write = cpu_rt_runtime_write,
},
+#endif
};
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -7926,7 +8014,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.early_init = 1,
};
-#endif /* CONFIG_FAIR_CGROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
#ifdef CONFIG_CGROUP_CPUACCT
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
+ PN(se.avg_overlap);
nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c091d6e159d..0080968d3e4a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost)
+ if (leftmost) {
cfs_rq->rb_leftmost = &se->run_node;
+ /*
+ * maintain cfs_rq->min_vruntime to be a monotonically increasing
+ * value tracking the leftmost vruntime in the tree.
+ */
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+ }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node)
- cfs_rq->rb_leftmost = rb_next(&se->run_node);
+ if (cfs_rq->rb_leftmost == &se->run_node) {
+ struct rb_node *next_node;
+ struct sched_entity *next;
+
+ next_node = rb_next(&se->run_node);
+ cfs_rq->rb_leftmost = next_node;
+
+ if (next_node) {
+ next = rb_entry(next_node,
+ struct sched_entity, run_node);
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime,
+ next->vruntime);
+ }
+ }
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
@@ -202,17 +225,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
- struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
- struct sched_entity *se = NULL;
- struct rb_node *parent;
+ struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
- while (*link) {
- parent = *link;
- se = rb_entry(parent, struct sched_entity, run_node);
- link = &parent->rb_right;
- }
+ if (!last)
+ return NULL;
- return se;
+ return rb_entry(last, struct sched_entity, run_node);
}
/**************************************************************
@@ -265,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running);
-
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
-
- return slice;
+ return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+ se->load.weight, &cfs_rq->load);
}
/*
@@ -288,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
return vslice;
}
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
-{
- return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
-}
-
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return __sched_vslice(cfs_rq->load.weight + se->load.weight,
@@ -308,7 +317,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
- u64 vruntime;
schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
@@ -320,19 +328,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
&curr->load);
}
curr->vruntime += delta_exec_weighted;
-
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(curr->vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = curr->vruntime;
-
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, vruntime);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -498,16 +493,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime;
- vruntime = cfs_rq->min_vruntime;
-
- if (sched_feat(TREE_AVG)) {
- struct sched_entity *last = __pick_last_entity(cfs_rq);
- if (last) {
- vruntime += last->vruntime;
- vruntime >>= 1;
- }
- } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
- vruntime += sched_vslice(cfs_rq)/2;
+ if (first_fair(cfs_rq)) {
+ vruntime = min_vruntime(cfs_rq->min_vruntime,
+ __pick_next_entity(cfs_rq)->vruntime);
+ } else
+ vruntime = cfs_rq->min_vruntime;
/*
* The 'current' period is already promised to the current tasks,
@@ -550,6 +540,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
account_entity_enqueue(cfs_rq, se);
}
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (!se->last_wakeup)
+ return;
+
+ update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+ se->last_wakeup = 0;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -560,6 +565,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_stats_dequeue(cfs_rq, se);
if (sleep) {
+ update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -621,12 +627,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 diff, gran;
+
+ if (!cfs_rq->next)
+ return se;
+
+ diff = cfs_rq->next->vruntime - se->vruntime;
+ if (diff < 0)
+ return se;
+
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+ if (diff > gran)
+ return se;
+
+ return cfs_rq->next;
+}
+
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = NULL;
if (first_fair(cfs_rq)) {
se = __pick_next_entity(cfs_rq);
+ se = pick_next(cfs_rq, se);
set_next_entity(cfs_rq, se);
}
@@ -732,8 +758,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
-#define GROUP_IMBALANCE_PCT 20
-
#else /* CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@@ -824,26 +848,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se,
- *topse = NULL; /* Highest schedulable entity */
- int incload = 1;
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
- topse = se;
- if (se->on_rq) {
- incload = 0;
+ if (se->on_rq)
break;
- }
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
- /* Increment cpu load if we just enqueued the first task of a group on
- * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
- * at the highest grouping level.
- */
- if (incload)
- inc_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
@@ -856,28 +869,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
{
struct cfs_rq *cfs_rq;
- struct sched_entity *se = &p->se,
- *topse = NULL; /* Highest schedulable entity */
- int decload = 1;
+ struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
- topse = se;
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, sleep);
/* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight) {
- if (parent_entity(se))
- decload = 0;
+ if (cfs_rq->load.weight)
break;
- }
sleep = 1;
}
- /* Decrement cpu load if we just dequeued the last task of a group on
- * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
- * at the highest grouping level.
- */
- if (decload)
- dec_cpu_load(rq, topse->load.weight);
hrtick_start_fair(rq, rq->curr);
}
@@ -979,96 +980,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#endif
#ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+ struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+ int idx, unsigned long load, unsigned long this_load,
+ unsigned int imbalance)
{
- int cpu, this_cpu;
- struct rq *rq;
- struct sched_domain *sd, *this_sd = NULL;
- int new_cpu;
+ struct task_struct *curr = this_rq->curr;
+ unsigned long tl = this_load;
+ unsigned long tl_per_task;
+
+ if (!(this_sd->flags & SD_WAKE_AFFINE))
+ return 0;
+
+ /*
+ * If the currently running task will sleep within
+ * a reasonable amount of time then attract this newly
+ * woken task:
+ */
+ if (sync && curr->sched_class == &fair_sched_class) {
+ if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ return 1;
+ }
- cpu = task_cpu(p);
- rq = task_rq(p);
- this_cpu = smp_processor_id();
- new_cpu = cpu;
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
- if (cpu == this_cpu)
- goto out_set_cpu;
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync)
+ tl -= current->se.load.weight;
+
+ if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->se.load.weight) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
+ return 1;
+ }
+ return 0;
+}
+
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+ struct sched_domain *sd, *this_sd = NULL;
+ int prev_cpu, this_cpu, new_cpu;
+ unsigned long load, this_load;
+ struct rq *rq, *this_rq;
+ unsigned int imbalance;
+ int idx;
+
+ prev_cpu = task_cpu(p);
+ rq = task_rq(p);
+ this_cpu = smp_processor_id();
+ this_rq = cpu_rq(this_cpu);
+ new_cpu = prev_cpu;
+
+ /*
+ * 'this_sd' is the first domain that both
+ * this_cpu and prev_cpu are present in:
+ */
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpu_isset(prev_cpu, sd->span)) {
this_sd = sd;
break;
}
}
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
- goto out_set_cpu;
+ goto out;
/*
* Check for affine wakeup and passive balancing possibilities.
*/
- if (this_sd) {
- int idx = this_sd->wake_idx;
- unsigned int imbalance;
- unsigned long load, this_load;
-
- imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
- load = source_load(cpu, idx);
- this_load = target_load(this_cpu, idx);
-
- new_cpu = this_cpu; /* Wake to this CPU if we can */
-
- if (this_sd->flags & SD_WAKE_AFFINE) {
- unsigned long tl = this_load;
- unsigned long tl_per_task;
-
- /*
- * Attract cache-cold tasks on sync wakeups:
- */
- if (sync && !task_hot(p, rq->clock, this_sd))
- goto out_set_cpu;
-
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync)
- tl -= current->se.load.weight;
-
- if ((tl <= load &&
- tl + target_load(cpu, idx) <= tl_per_task) ||
- 100*(tl + p->se.load.weight) <= imbalance*load) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(this_sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
- goto out_set_cpu;
- }
- }
+ if (!this_sd)
+ goto out;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- if (this_sd->flags & SD_WAKE_BALANCE) {
- if (imbalance*this_load <= 100*load) {
- schedstat_inc(this_sd, ttwu_move_balance);
- schedstat_inc(p, se.nr_wakeups_passive);
- goto out_set_cpu;
- }
+ idx = this_sd->wake_idx;
+
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+ load, this_load, imbalance))
+ return this_cpu;
+
+ if (prev_cpu == this_cpu)
+ goto out;
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
+ return this_cpu;
}
}
- new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
@@ -1090,6 +1116,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
resched_task(curr);
return;
}
+
+ se->last_wakeup = se->sum_exec_runtime;
+ if (unlikely(se == pse))
+ return;
+
+ cfs_rq_of(pse)->next = pse;
+
/*
* Batch tasks do not preempt (their preemption is driven by
* the tick):
@@ -1191,6 +1224,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *curr;
+ struct task_struct *p;
+
+ if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+ return MAX_PRIO;
+
+ curr = cfs_rq->curr;
+ if (!curr)
+ curr = __pick_next_entity(cfs_rq);
+
+ p = task_of(curr);
+
+ return p->prio;
+}
+#endif
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -1200,45 +1252,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
struct rq_iterator cfs_rq_iterator;
- unsigned long load_moved;
cfs_rq_iterator.start = load_balance_start_fair;
cfs_rq_iterator.next = load_balance_next_fair;
for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
- unsigned long maxload, task_load, group_weight;
- unsigned long thisload, per_task_load;
- struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
-
- task_load = busy_cfs_rq->load.weight;
- group_weight = se->load.weight;
+ struct cfs_rq *this_cfs_rq;
+ long imbalance;
+ unsigned long maxload;
- /*
- * 'group_weight' is contributed by tasks of total weight
- * 'task_load'. To move 'rem_load_move' worth of weight only,
- * we need to move a maximum task load of:
- *
- * maxload = (remload / group_weight) * task_load;
- */
- maxload = (rem_load_move * task_load) / group_weight;
+ this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
- if (!maxload || !task_load)
+ imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+ /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+ if (imbalance <= 0)
continue;
- per_task_load = task_load / busy_cfs_rq->nr_running;
- /*
- * balance_tasks will try to forcibly move atleast one task if
- * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
- * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
- */
- if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
- continue;
+ /* Don't pull more than imbalance/2 */
+ imbalance /= 2;
+ maxload = min(rem_load_move, imbalance);
- /* Disable priority-based load balance */
- *this_best_prio = 0;
- thisload = this_cfs_rq->load.weight;
+ *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
# define maxload rem_load_move
#endif
@@ -1247,33 +1282,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
* load_balance_[start|next]_fair iterators
*/
cfs_rq_iterator.arg = busy_cfs_rq;
- load_moved = balance_tasks(this_rq, this_cpu, busiest,
+ rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
maxload, sd, idle, all_pinned,
this_best_prio,
&cfs_rq_iterator);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * load_moved holds the task load that was moved. The
- * effective (group) weight moved would be:
- * load_moved_eff = load_moved/task_load * group_weight;
- */
- load_moved = (group_weight * load_moved) / task_load;
-
- /* Adjust shares on both cpus to reflect load_moved */
- group_weight -= load_moved;
- set_se_shares(se, group_weight);
-
- se = busy_cfs_rq->tg->se[this_cpu];
- if (!thisload)
- group_weight = load_moved;
- else
- group_weight = se->load.weight + load_moved;
- set_se_shares(se, group_weight);
-#endif
-
- rem_load_move -= load_moved;
-
if (rem_load_move <= 0)
break;
}
@@ -1403,6 +1416,16 @@ static void set_curr_task_fair(struct rq *rq)
set_next_entity(cfs_rq_of(se), se);
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void moved_group_fair(struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+ update_curr(cfs_rq);
+ place_entity(cfs_rq, &p->se, 1);
+}
+#endif
+
/*
* All the scheduling class methods:
*/
@@ -1431,6 +1454,10 @@ static const struct sched_class fair_sched_class = {
.prio_changed = prio_changed_fair,
.switched_to = switched_to_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ .moved_group = moved_group_fair,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 274b40d7bef2..0a6d2e516420 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
return !list_empty(&rt_se->run_list);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
if (!rt_rq->tg)
- return SCHED_RT_FRAC;
+ return RUNTIME_INF;
- return rt_rq->tg->rt_ratio;
+ return rt_rq->tg->rt_runtime;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
-static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
}
}
-static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
struct sched_rt_entity *rt_se = rt_rq->rt_se;
@@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
dequeue_rt_entity(rt_se);
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = group_rt_rq(rt_se);
+ struct task_struct *p;
+
+ if (rt_rq)
+ return !!rt_rq->rt_nr_boosted;
+
+ p = rt_task_of(rt_se);
+ return p->prio != p->normal_prio;
+}
+
#else
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
- return sysctl_sched_rt_ratio;
+ if (sysctl_sched_rt_runtime == -1)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return NULL;
}
-static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
}
-static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
#endif
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_RT_GROUP_SCHED
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
@@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
return rt_task_of(rt_se)->prio;
}
-static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
- unsigned int rt_ratio = sched_rt_ratio(rt_rq);
- u64 period, ratio;
+ u64 runtime = sched_rt_runtime(rt_rq);
- if (rt_ratio == SCHED_RT_FRAC)
+ if (runtime == RUNTIME_INF)
return 0;
if (rt_rq->rt_throttled)
- return 1;
-
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
- ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+ return rt_rq_throttled(rt_rq);
- if (rt_rq->rt_time > ratio) {
+ if (rt_rq->rt_time > runtime) {
struct rq *rq = rq_of_rt_rq(rt_rq);
rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
- sched_rt_ratio_dequeue(rt_rq);
- return 1;
+ if (rt_rq_throttled(rt_rq)) {
+ sched_rt_rq_dequeue(rt_rq);
+ return 1;
+ }
}
return 0;
@@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
u64 period;
while (rq->clock > rq->rt_period_expire) {
- period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+ period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
rq->rt_period_expire += period;
for_each_leaf_rt_rq(rt_rq, rq) {
- unsigned long rt_ratio = sched_rt_ratio(rt_rq);
- u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+ u64 runtime = sched_rt_runtime(rt_rq);
- rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
- if (rt_rq->rt_throttled) {
+ rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
+ if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0;
- sched_rt_ratio_enqueue(rt_rq);
+ sched_rt_rq_enqueue(rt_rq);
}
}
@@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
cpuacct_charge(curr, delta_exec);
rt_rq->rt_time += delta_exec;
- /*
- * might make it a tad more accurate:
- *
- * update_sched_rt_period(rq);
- */
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
}
@@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_se_prio(rt_se) < rt_rq->highest_prio)
rt_rq->highest_prio = rt_se_prio(rt_se);
#endif
@@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted++;
+#endif
}
static inline
@@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--;
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_rq->rt_nr_running) {
struct rt_prio_array *array;
@@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_se_boosted(rt_se))
+ rt_rq->rt_nr_boosted--;
+
+ WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +329,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
- if (group_rq && group_rq->rt_throttled)
+ if (group_rq && rt_rq_throttled(group_rq))
return;
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -367,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/
for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se);
-
- inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -388,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se);
}
-
- dec_cpu_load(rq, p->se.load.weight);
}
/*
@@ -496,7 +518,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
- if (sched_rt_ratio_exceeded(rt_rq))
+ if (rt_rq_throttled(rt_rq))
return NULL;
do {
@@ -1085,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
pull_rt_task(rq);
/*
* If there's a higher priority task waiting to run
- * then reschedule.
+ * then reschedule. Note, the above pull_rt_task
+ * can release the rq lock and p could migrate.
+ * Only reschedule if p is still on the same runqueue.
*/
- if (p->prio > rq->rt.highest_prio)
+ if (p->prio > rq->rt.highest_prio && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
new file mode 100644
index 000000000000..5c2942e768cd
--- /dev/null
+++ b/kernel/semaphore.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox <willy@linux.intel.com>
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks which enforce
+ * rules which allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we
+ * have to disable interrupts when taking the lock. It turns out various
+ * parts of the kernel expect to be able to use down() on a semaphore in
+ * interrupt context when they know it will succeed, so we have to use
+ * irqsave variants for down(), down_interruptible() and down_killable()
+ * too.
+ *
+ * The ->count variable represents how many more tasks can acquire this
+ * semaphore. If it's zero, there may be tasks waiting on the wait_list.
+ */
+
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/spinlock.h>
+
+static noinline void __down(struct semaphore *sem);
+static noinline int __down_interruptible(struct semaphore *sem);
+static noinline int __down_killable(struct semaphore *sem);
+static noinline int __down_timeout(struct semaphore *sem, long jiffies);
+static noinline void __up(struct semaphore *sem);
+
+/**
+ * down - acquire the semaphore
+ * @sem: the semaphore to be acquired
+ *
+ * Acquires the semaphore. If no more tasks are allowed to acquire the
+ * semaphore, calling this function will put the task to sleep until the
+ * semaphore is released.
+ *
+ * Use of this function is deprecated; please use down_interruptible() or
+ * down_killable() instead.
+ */
+void down(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ __down(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(down);
+
+/**
+ * down_interruptible - acquire the semaphore unless interrupted
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a signal, this function will return -EINTR.
+ * If the semaphore is successfully acquired, this function returns 0.
+ */
+int down_interruptible(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_interruptible(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_interruptible);
+
+/**
+ * down_killable - acquire the semaphore unless killed
+ * @sem: the semaphore to be acquired
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the sleep is interrupted by a fatal signal, this function will return
+ * -EINTR. If the semaphore is successfully acquired, this function returns
+ * 0.
+ */
+int down_killable(struct semaphore *sem)
+{
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_killable(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_killable);
+
+/**
+ * down_trylock - try to acquire the semaphore, without waiting
+ * @sem: the semaphore to be acquired
+ *
+ * Try to acquire the semaphore atomically. Returns 0 if the semaphore has
+ * been acquired successfully or 1 if it cannot be acquired.
+ *
+ * NOTE: This return value is inverted from both spin_trylock and
+ * mutex_trylock! Be careful about this when converting code.
+ *
+ * Unlike mutex_trylock, this function can be used from interrupt context,
+ * and the semaphore can be released by any task or interrupt.
+ */
+int down_trylock(struct semaphore *sem)
+{
+ unsigned long flags;
+ int count;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ count = sem->count - 1;
+ if (likely(count >= 0))
+ sem->count = count;
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return (count < 0);
+}
+EXPORT_SYMBOL(down_trylock);
+
+/**
+ * down_timeout - acquire the semaphore within a specified time
+ * @sem: the semaphore to be acquired
+ * @jiffies: how long to wait before failing
+ *
+ * Attempts to acquire the semaphore. If no more tasks are allowed to
+ * acquire the semaphore, calling this function will put the task to sleep.
+ * If the semaphore is not released within the specified number of jiffies,
+ * this function returns -ETIME. It returns 0 if the semaphore was acquired.
+ */
+int down_timeout(struct semaphore *sem, long jiffies)
+{
+ unsigned long flags;
+ int result = 0;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(sem->count > 0))
+ sem->count--;
+ else
+ result = __down_timeout(sem, jiffies);
+ spin_unlock_irqrestore(&sem->lock, flags);
+
+ return result;
+}
+EXPORT_SYMBOL(down_timeout);
+
+/**
+ * up - release the semaphore
+ * @sem: the semaphore to release
+ *
+ * Release the semaphore. Unlike mutexes, up() may be called from any
+ * context and even by tasks which have never called down().
+ */
+void up(struct semaphore *sem)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&sem->lock, flags);
+ if (likely(list_empty(&sem->wait_list)))
+ sem->count++;
+ else
+ __up(sem);
+ spin_unlock_irqrestore(&sem->lock, flags);
+}
+EXPORT_SYMBOL(up);
+
+/* Functions for the contended case */
+
+struct semaphore_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ int up;
+};
+
+/*
+ * Because this function is inlined, the 'state' parameter will be
+ * constant, and thus optimised away by the compiler. Likewise the
+ * 'timeout' parameter for the cases without timeouts.
+ */
+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ struct task_struct *task = current;
+ struct semaphore_waiter waiter;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+ waiter.task = task;
+ waiter.up = 0;
+
+ for (;;) {
+ if (state == TASK_INTERRUPTIBLE && signal_pending(task))
+ goto interrupted;
+ if (state == TASK_KILLABLE && fatal_signal_pending(task))
+ goto interrupted;
+ if (timeout <= 0)
+ goto timed_out;
+ __set_task_state(task, state);
+ spin_unlock_irq(&sem->lock);
+ timeout = schedule_timeout(timeout);
+ spin_lock_irq(&sem->lock);
+ if (waiter.up)
+ return 0;
+ }
+
+ timed_out:
+ list_del(&waiter.list);
+ return -ETIME;
+
+ interrupted:
+ list_del(&waiter.list);
+ return -EINTR;
+}
+
+static noinline void __sched __down(struct semaphore *sem)
+{
+ __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_interruptible(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_killable(struct semaphore *sem)
+{
+ return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
+}
+
+static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies)
+{
+ return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies);
+}
+
+static noinline void __sched __up(struct semaphore *sem)
+{
+ struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
+ struct semaphore_waiter, list);
+ list_del(&waiter->list);
+ waiter->up = 1;
+ wake_up_process(waiter->task);
+}
diff --git a/kernel/signal.c b/kernel/signal.c
index 2c1f08defac2..cc8303cd093d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p)
}
}
-int fastcall __fatal_signal_pending(struct task_struct *tsk)
+int __fatal_signal_pending(struct task_struct *tsk)
{
return sigismember(&tsk->pending.signal, SIGKILL);
}
@@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
/* Let the debugger run. */
__set_current_state(TASK_TRACED);
spin_unlock_irq(&current->sighand->siglock);
- try_to_freeze();
read_lock(&tasklist_lock);
if (!unlikely(killed) && may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
@@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
}
/*
+ * While in TASK_TRACED, we were considered "frozen enough".
+ * Now that we woke up, it's crucial if we're supposed to be
+ * frozen that we freeze now before running anything substantial.
+ */
+ try_to_freeze();
+
+ /*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
@@ -1751,15 +1757,60 @@ static int do_signal_stop(int signr)
return 1;
}
+/*
+ * Give a ptrace debugger the chance to inspect, replace or cancel @signr
+ * before it is delivered to the current task.
+ *
+ * Returns the signal number that should now be delivered, or 0 when there
+ * is nothing to deliver (the debugger cancelled the signal, or the signal
+ * is currently blocked and has been requeued).  If the task is not being
+ * ptraced, @signr is returned untouched.
+ */
+static int ptrace_signal(int signr, siginfo_t *info,
+			 struct pt_regs *regs, void *cookie)
+{
+	if (!(current->ptrace & PT_PTRACED))
+		return signr;
+
+	ptrace_signal_deliver(regs, cookie);
+
+	/* Stop and hand control to the debugger. */
+	ptrace_stop(signr, 0, info);
+
+	/* Resumed: exit_code holds the debugger's answer, 0 == cancelled. */
+	signr = current->exit_code;
+	if (!signr)
+		return 0;
+
+	current->exit_code = 0;
+
+	/*
+	 * The debugger picked a different signal: rewrite the siginfo to
+	 * match.  A debugger that wants specific siginfo contents should
+	 * have updated *info via PTRACE_SETSIGINFO.
+	 */
+	if (info->si_signo != signr) {
+		info->si_signo = signr;
+		info->si_errno = 0;
+		info->si_code = SI_USER;
+		info->si_pid = task_pid_vnr(current->parent);
+		info->si_uid = current->parent->uid;
+	}
+
+	/* Deliverable right away unless the (new) signal is blocked. */
+	if (!sigismember(&current->blocked, signr))
+		return signr;
+
+	/* Blocked: put it back on the queue and deliver nothing now. */
+	specific_send_sig_info(signr, info, current);
+	return 0;
+}
+
int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
struct pt_regs *regs, void *cookie)
{
sigset_t *mask = &current->blocked;
int signr = 0;
+relock:
+ /*
+ * We'll jump back here after any time we were stopped in TASK_STOPPED.
+ * While in TASK_STOPPED, we were considered "frozen enough".
+ * Now that we woke up, it's crucial if we're supposed to be
+ * frozen that we freeze now before running anything substantial.
+ */
try_to_freeze();
-relock:
spin_lock_irq(&current->sighand->siglock);
for (;;) {
struct k_sigaction *ka;
@@ -1773,36 +1824,10 @@ relock:
if (!signr)
break; /* will return 0 */
- if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) {
- ptrace_signal_deliver(regs, cookie);
-
- /* Let the debugger run. */
- ptrace_stop(signr, 0, info);
-
- /* We're back. Did the debugger cancel the sig? */
- signr = current->exit_code;
- if (signr == 0)
- continue;
-
- current->exit_code = 0;
-
- /* Update the siginfo structure if the signal has
- changed. If the debugger wanted something
- specific in the siginfo structure then it should
- have updated *info via PTRACE_SETSIGINFO. */
- if (signr != info->si_signo) {
- info->si_signo = signr;
- info->si_errno = 0;
- info->si_code = SI_USER;
- info->si_pid = task_pid_vnr(current->parent);
- info->si_uid = current->parent->uid;
- }
-
- /* If the (new) signal is now blocked, requeue it. */
- if (sigismember(&current->blocked, signr)) {
- specific_send_sig_info(signr, info, current);
+ if (signr != SIGKILL) {
+ signr = ptrace_signal(signr, info, regs, cookie);
+ if (!signr)
continue;
- }
}
ka = &current->sighand->action[signr-1];
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5b3aea5f471e..31e9f2a47928 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -313,6 +313,7 @@ void irq_exit(void)
/* Make sure that timer wheel updates are propagated */
if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
tick_nohz_stop_sched_tick();
+ rcu_irq_exit();
#endif
preempt_enable_no_resched();
}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7c2da88db4ed..01b6522fd92b 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu)
/* initialize timestamp */
touch_softlockup_watchdog();
+ set_current_state(TASK_INTERRUPTIBLE);
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 60 seconds then the
* debug-printout triggers in softlockup_tick().
*/
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
touch_softlockup_watchdog();
schedule();
if (kthread_should_stop())
break;
- if (this_cpu != check_cpu)
- continue;
-
- if (sysctl_hung_task_timeout_secs)
- check_hung_uninterruptible_tasks(this_cpu);
+ if (this_cpu == check_cpu) {
+ if (sysctl_hung_task_timeout_secs)
+ check_hung_uninterruptible_tasks(this_cpu);
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
}
+ __set_current_state(TASK_RUNNING);
return 0;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d41ef6b4cf72..b2a2d6889bab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,9 +311,10 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#endif
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_period_ms",
+ .procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
@@ -321,31 +322,12 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
- .procname = "sched_rt_ratio",
- .data = &sysctl_sched_rt_ratio,
- .maxlen = sizeof(unsigned int),
+ .procname = "sched_rt_runtime_us",
+ .data = &sysctl_sched_rt_runtime,
+ .maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_min_bal_int_shares",
- .data = &sysctl_sched_min_bal_int_shares,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
- {
- .ctl_name = CTL_UNNUMBERED,
- .procname = "sched_max_bal_int_shares",
- .data = &sysctl_sched_max_bal_int_shares,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = &proc_dointvec,
- },
-#endif
-#endif
{
.ctl_name = CTL_UNNUMBERED,
.procname = "sched_compat_yield",
@@ -978,8 +960,8 @@ static struct ctl_table vm_table[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "nr_overcommit_hugepages",
- .data = &nr_overcommit_huge_pages,
- .maxlen = sizeof(nr_overcommit_huge_pages),
+ .data = &sysctl_overcommit_huge_pages,
+ .maxlen = sizeof(sysctl_overcommit_huge_pages),
.mode = 0644,
.proc_handler = &hugetlb_overcommit_handler,
},
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 548c436a776b..f61402b1f2d0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data)
}
if (!list_empty(&watchdog_list)) {
- /* Cycle through CPUs to check if the CPUs stay synchronized to
- * each other. */
- int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
- if (next_cpu >= NR_CPUS)
- next_cpu = first_cpu(cpu_online_map);
- watchdog_timer.expires += WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, next_cpu);
+ __mod_timer(&watchdog_timer,
+ watchdog_timer.expires + WATCHDOG_INTERVAL);
}
spin_unlock(&watchdog_lock);
}
@@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (!started && watchdog) {
watchdog_last = watchdog->read();
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
+ add_timer(&watchdog_timer);
}
} else {
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -179,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
if (watchdog)
del_timer(&watchdog_timer);
watchdog = cs;
- init_timer_deferrable(&watchdog_timer);
+ init_timer(&watchdog_timer);
watchdog_timer.function = clocksource_watchdog;
/* Reset watchdog cycles */
@@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
watchdog_last = watchdog->read();
watchdog_timer.expires =
jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer,
- first_cpu(cpu_online_map));
+ add_timer(&watchdog_timer);
}
}
}
@@ -228,6 +222,18 @@ void clocksource_resume(void)
}
/**
+ * clocksource_touch_watchdog - Update watchdog
+ *
+ * Update the watchdog after exception contexts such as kgdb so as not
+ * to incorrectly trip the watchdog.
+ *
+ */
+void clocksource_touch_watchdog(void)
+{
+ /* All the work is delegated to clocksource_resume_watchdog(). */
+ clocksource_resume_watchdog();
+}
+
+/**
* clocksource_get_next - Returns the selected clocksource
*
*/
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7ab..5fd9b9469770 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
long time_freq; /* frequency offset (scaled ppm)*/
static long time_reftime; /* time at last adjustment (s) */
long time_adjust;
+static long ntp_tick_adj;
static void ntp_update_frequency(void)
{
u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
<< TICK_LENGTH_SHIFT;
- second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+ second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
freq_adj = shift_right(freq_adj, time_constant * 2 +
(SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+ u64 utemp64;
temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
if (time_offset < 0) {
- temp64 = -temp64;
- do_div(temp64, mtemp);
- freq_adj -= temp64;
+ utemp64 = -temp64;
+ do_div(utemp64, mtemp);
+ freq_adj -= utemp64;
} else {
- do_div(temp64, mtemp);
- freq_adj += temp64;
+ utemp64 = temp64;
+ do_div(utemp64, mtemp);
+ freq_adj += utemp64;
}
}
freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
notify_cmos_timer();
return(result);
}
+
+/*
+ * Handler for the "ntp_tick_adj=" kernel command-line option.
+ *
+ * Stores the parsed number in ntp_tick_adj, which ntp_update_frequency()
+ * adds (shifted by TICK_LENGTH_SHIFT) into second_length.  Base 0 is
+ * passed to simple_strtol() and no end pointer is requested, so trailing
+ * characters in @str are not diagnosed here.
+ *
+ * Always returns 1 (presumably the __setup convention for "option
+ * handled" -- confirm against the __setup documentation).
+ */
+static int __init ntp_tick_adj_setup(char *str)
+{
+ ntp_tick_adj = simple_strtol(str, NULL, 0);
+ return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e1bd50cbbf5d..fdfa0c745bb6 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -14,7 +14,7 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
-#include <linux/irq.h>
+#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 1bea399a9ef0..4f3886562b8c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -14,12 +14,14 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
-#include <linux/irq.h>
+#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/tick.h>
+#include <asm/irq_regs.h>
+
#include "tick-internal.h"
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0258d3115d54..450c04935b66 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -14,7 +14,7 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
-#include <linux/irq.h>
+#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa9bb73dbdb4..686da821d376 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void)
ts->idle_tick = ts->sched_timer.expires;
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
+ rcu_enter_nohz();
}
/*
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void)
return;
}
+ rcu_exit_nohz();
+
/* Update jiffies first */
select_nohz_load_balancer(0);
now = ktime_get();
@@ -637,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
- ts->tick_stopped = 0;
+
ts->nohz_mode = NOHZ_MODE_INACTIVE;
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe2..a3fa587c350c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,13 +187,16 @@ static void change_clocksource(void)
clock->error = 0;
clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock,
- (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
tick_clock_notify();
+ /*
+ * We're holding xtime lock and waking up klogd would deadlock
+ * us on enqueue. So no printing!
printk(KERN_INFO "Time: %s clocksource has been installed.\n",
clock->name);
+ */
}
#else
static inline void change_clocksource(void) { }
@@ -245,8 +248,7 @@ void __init timekeeping_init(void)
ntp_clear();
clock = clocksource_get_next();
- clocksource_calculate_interval(clock,
- (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clock->cycle_last = clocksource_read(clock);
xtime.tv_sec = sec;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index d3d94c1a0fd2..67fe8fc21fb1 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,9 +65,9 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
#endif
SEQ_printf(m, "\n");
- SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
+ SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(timer->expires),
- (unsigned long long)(ktime_to_ns(timer->expires) - now));
+ (long long)(ktime_to_ns(timer->expires) - now));
}
static void
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index 62b1287932ed..41468035473c 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -339,7 +339,7 @@ sub output($@)
print "\n";
foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ',
- 'USEC_TO_HZ','HZ_TO_USEC') {
+ 'HZ_TO_USEC','USEC_TO_HZ') {
foreach $bit (32, 64) {
foreach $suf ('MUL', 'ADJ', 'SHR') {
printf "#define %-23s %s\n",
diff --git a/kernel/timer.c b/kernel/timer.c
index 99b00a25f88b..b024106daa70 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_lock_irqsave(&base->lock, flags);
timer_set_base(timer, base);
internal_add_timer(base, timer);
+ /*
+ * Check whether the other CPU is idle and needs to be
+ * triggered to reevaluate the timer wheel when nohz is
+ * active. We are protected against the other CPU fiddling
+ * with the timer by holding the timer base lock. This also
+ * makes sure that a CPU on the way to idle can not evaluate
+ * the timer wheel.
+ */
+ wake_up_idle_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
-
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
diff --git a/kernel/uid16.c b/kernel/uid16.c
index dd308ba4e03b..3e41c1673e2f 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi
{
long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(3, ret, filename, user, group);
return ret;
}
@@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g
{
long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(3, ret, filename, user, group);
return ret;
}
@@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
{
long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(3, ret, fd, user, group);
return ret;
}
@@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
{
long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(2, ret, rgid, egid);
return ret;
}
@@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid)
{
long ret = sys_setgid(low2highgid(gid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, gid);
return ret;
}
@@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
{
long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(2, ret, ruid, euid);
return ret;
}
@@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid)
{
long ret = sys_setuid(low2highuid(uid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, uid);
return ret;
}
@@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
low2highuid(suid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(3, ret, ruid, euid, suid);
return ret;
}
@@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
low2highgid(sgid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(3, ret, rgid, egid, sgid);
return ret;
}
@@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid)
{
long ret = sys_setfsuid(low2highuid(uid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, uid);
return ret;
}
@@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid)
{
long ret = sys_setfsgid(low2highgid(gid));
/* avoid REGPARM breakage on x86: */
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, gid);
return ret;
}
diff --git a/kernel/user.c b/kernel/user.c
index 7d7900c5a1fd..7132022a040c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -57,7 +57,7 @@ struct user_struct root_user = {
.uid_keyring = &root_user_keyring,
.session_keyring = &root_session_keyring,
#endif
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
.tg = &init_task_group,
#endif
};
@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
return NULL;
}
-#ifdef CONFIG_FAIR_USER_SCHED
+#ifdef CONFIG_USER_SCHED
static void sched_destroy_user(struct user_struct *up)
{
@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p)
sched_move_task(p);
}
-#else /* CONFIG_FAIR_USER_SCHED */
+#else /* CONFIG_USER_SCHED */
static void sched_destroy_user(struct user_struct *up) { }
static int sched_create_user(struct user_struct *up) { return 0; }
static void sched_switch_user(struct task_struct *p) { }
-#endif /* CONFIG_FAIR_USER_SCHED */
+#endif /* CONFIG_USER_SCHED */
-#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
static DEFINE_MUTEX(uids_mutex);
@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void)
}
/* uid directory attributes */
+#ifdef CONFIG_FAIR_GROUP_SCHED
static ssize_t cpu_shares_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
@@ -163,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
static struct kobj_attribute cpu_share_attr =
__ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * sysfs show handler for the per-uid "cpu_rt_runtime" attribute: prints
+ * the value reported by sched_group_rt_runtime() for the user's task
+ * group, followed by a newline.  Returns the number of bytes written
+ * to @buf.
+ */
+static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   char *buf)
+{
+	struct user_struct *user;
+
+	user = container_of(kobj, struct user_struct, kobj);
+	return sprintf(buf, "%lu\n", sched_group_rt_runtime(user->tg));
+}
+
+/*
+ * sysfs store handler for the per-uid "cpu_rt_runtime" attribute.
+ *
+ * Parses an unsigned decimal number from @buf and passes it to
+ * sched_group_set_rt_runtime() for the user's task group.
+ *
+ * Returns @size on success, -EINVAL when @buf does not start with a
+ * number, or the error returned by sched_group_set_rt_runtime().
+ */
+static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    const char *buf, size_t size)
+{
+	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+	unsigned long rt_runtime;
+	int rc;
+
+	/*
+	 * Reject unparsable input: without this check rt_runtime would be
+	 * handed to the scheduler uninitialised.
+	 */
+	if (sscanf(buf, "%lu", &rt_runtime) != 1)
+		return -EINVAL;
+
+	rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
+
+	return (rc ? rc : size);
+}
+
+/* Per-uid "cpu_rt_runtime" attribute (mode 0644) using the show/store handlers above. */
+static struct kobj_attribute cpu_rt_runtime_attr =
+ __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+#endif
/* default attributes per uid directory */
static struct attribute *uids_attributes[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
&cpu_share_attr.attr,
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ &cpu_rt_runtime_attr.attr,
+#endif
NULL
};
@@ -269,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
schedule_work(&up->work);
}
-#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
+#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
int uids_sysfs_init(void) { return 0; }
static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -373,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
- /* This case is not possible when CONFIG_FAIR_USER_SCHED
+ /* This case is not possible when CONFIG_USER_SCHED
* is defined, since we serialize alloc_uid() using
* uids_mutex. Hence no need to call
* sched_destroy_user() or remove_user_sysfs_dir().