Diffstat (limited to 'kernel')
54 files changed, 4316 insertions(+), 1324 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0669b70fa6a3..9fdba03dc1fc 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,8 +52,23 @@ config PREEMPT endchoice +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + default n + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + + Say N if you are unsure. + config RCU_TRACE bool "Enable tracing for RCU - currently stats in debugfs" + depends on PREEMPT_RCU select DEBUG_FS default y help diff --git a/kernel/Makefile b/kernel/Makefile index 6c584c55a6e9..6c5f081132a4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o obj-$(CONFIG_SYSCTL) += sysctl_check.o @@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/acct.c b/kernel/acct.c index 521dfa53cb99..91e1cfd734d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -58,6 +58,7 @@ #include <asm/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ +#include <linux/pid_namespace.h> /* * These constants control the amount of freespace that suspend and @@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct file *); +static void do_acct_process(struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock @@ -86,6 +87,7 @@ struct acct_glbs { volatile int active; volatile int needcheck; struct file *file; + struct pid_namespace *ns; struct timer_list timer; }; @@ -175,9 +177,11 @@ out: static void acct_file_reopen(struct file *file) { struct file *old_acct = NULL; + struct pid_namespace *old_ns = NULL; if (acct_globals.file) { old_acct = acct_globals.file; + old_ns = acct_globals.ns; del_timer(&acct_globals.timer); acct_globals.active = 0; acct_globals.needcheck = 0; @@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file) } if (file) { acct_globals.file = file; + acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); acct_globals.needcheck = 0; acct_globals.active = 1; /* It's been deleted if it was used before so this is safe */ @@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); - do_acct_process(old_acct); + do_acct_process(old_ns, old_acct); filp_close(old_acct, NULL); + put_pid_ns(old_ns); spin_lock(&acct_globals.lock); } } @@ -419,7 +425,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. 
*/ -static void do_acct_process(struct file *file) +static void do_acct_process(struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -481,8 +487,10 @@ static void do_acct_process(struct file *file) ac.ac_gid16 = current->gid; #endif #if ACCT_VERSION==3 - ac.ac_pid = current->tgid; - ac.ac_ppid = current->real_parent->tgid; + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); #endif spin_lock_irq(¤t->sighand->siglock); @@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead) void acct_process(void) { struct file *file = NULL; + struct pid_namespace *ns; /* * accelerate the common fastpath: @@ -592,8 +601,10 @@ void acct_process(void) return; } get_file(file); + ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_globals.lock); - do_acct_process(file); + do_acct_process(ns, file); fput(file); + put_pid_ns(ns); } diff --git a/kernel/audit.c b/kernel/audit.c index c8555b180213..b782b046543d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -78,9 +78,13 @@ static int audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ static int audit_failure = AUDIT_FAIL_PRINTK; -/* If audit records are to be written to the netlink socket, audit_pid - * contains the (non-zero) pid. */ +/* + * If audit records are to be written to the netlink socket, audit_pid + * contains the pid of the auditd process and audit_nlk_pid contains + * the pid to use to send netlink messages to that process. + */ int audit_pid; +static int audit_nlk_pid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -170,7 +174,9 @@ void audit_panic(const char *message) printk(KERN_ERR "audit: %s\n", message); break; case AUDIT_FAIL_PANIC: - panic("audit: %s\n", message); + /* test audit_pid since printk is always losey, why bother? 
*/ + if (audit_pid) + panic("audit: %s\n", message); break; } } @@ -348,10 +354,11 @@ static int kauditd_thread(void *dummy) wake_up(&audit_backlog_wait); if (skb) { if (audit_pid) { - int err = netlink_unicast(audit_sock, skb, audit_pid, 0); + int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_log_lost("auditd dissapeared\n"); audit_pid = 0; } } else { @@ -623,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sid, 1); audit_pid = new_pid; + audit_nlk_pid = NETLINK_CB(skb).pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) err = audit_set_rate_limit(status_get->rate_limit, @@ -1261,8 +1269,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, /** * audit_string_contains_control - does a string need to be logged in hex - * @string - string to be checked - * @len - max length of the string to check + * @string: string to be checked + * @len: max length of the string to check */ int audit_string_contains_control(const char *string, size_t len) { @@ -1277,7 +1285,7 @@ int audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: lenth of string (not including trailing null) + * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string @@ -1312,26 +1320,26 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct dentry *dentry, struct vfsmount *vfsmnt) + struct path *path) { - char *p, *path; + char *p, *pathname; if (prefix) audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ - path = kmalloc(PATH_MAX+11, ab->gfp_mask); - if (!path) { + pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); + if (!pathname) { audit_log_format(ab, "<no memory>"); return; } - p = d_path(dentry, vfsmnt, path, PATH_MAX+11); + p = d_path(path, pathname, PATH_MAX+11); if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ /* FIXME: can we save some information here? 
*/ audit_log_format(ab, "<too long>"); } else audit_log_untrustedstring(ab, p); - kfree(path); + kfree(pathname); } /** @@ -1350,17 +1358,19 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { + struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); if (audit_pid) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); skb_queue_tail(&audit_skb_queue, ab->skb); ab->skb = NULL; wake_up_interruptible(&kauditd_wait); - } else if (printk_ratelimit()) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); - } else { - audit_log_lost("printk limit exceeded\n"); + } else if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) { + printk(KERN_NOTICE "type=%d %s\n", + nlh->nlmsg_type, + ab->skb->data + NLMSG_SPACE(0)); + } else + audit_log_lost("printk limit exceeded\n"); } } audit_buffer_free(ab); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f4fcf58f20f8..9ef5e0aacc3c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -549,8 +549,8 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!root_mnt) goto skip_it; @@ -583,17 +583,17 @@ skip_it: static int is_under(struct vfsmount *mnt, struct dentry *dentry, struct nameidata *nd) { - if (mnt != nd->mnt) { + if (mnt != nd->path.mnt) { for (;;) { if (mnt->mnt_parent == mnt) return 0; - if (mnt->mnt_parent == nd->mnt) + if (mnt->mnt_parent == nd->path.mnt) break; mnt = mnt->mnt_parent; } dentry = mnt->mnt_mountpoint; } - return is_subdir(dentry, nd->dentry); + return is_subdir(dentry, nd->path.dentry); } int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) @@ -641,8 +641,8 @@ int audit_add_tree_rule(struct audit_krule *rule) err = path_lookup(tree->pathname, 0, &nd); if (err) goto Err; - mnt = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + mnt = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!mnt) { err = -ENOMEM; goto Err; @@ -701,8 +701,8 @@ int audit_tag_tree(char *old, char *new) err = path_lookup(new, 0, &nd); if (err) return err; - tagged = collect_mounts(nd.mnt, nd.dentry); - path_release(&nd); + tagged = collect_mounts(nd.path.mnt, nd.path.dentry); + path_put(&nd.path); if (!tagged) return -ENOMEM; @@ -711,9 +711,9 @@ int audit_tag_tree(char *old, char *new) drop_collected_mounts(tagged); return err; } - mnt = mntget(nd.mnt); - dentry = dget(nd.dentry); - path_release(&nd); + mnt = mntget(nd.path.mnt); + dentry = dget(nd.path.dentry); + path_put(&nd.path); if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) follow_up(&mnt, &dentry); @@ -744,13 +744,13 @@ int audit_tag_tree(char *old, char *new) spin_lock(&vfsmount_lock); if (!is_under(mnt, dentry, &nd)) { spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } spin_unlock(&vfsmount_lock); - path_release(&nd); + path_put(&nd.path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6f19fd477aac..2f2914b7cc30 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -169,8 +169,8 @@ static struct audit_parent *audit_init_parent(struct nameidata *ndp) inotify_init_watch(&parent->wdata); /* grab a ref so 
inotify watch hangs around until we take audit_filter_mutex */ get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, - AUDIT_IN_WATCH); + wd = inotify_add_watch(audit_ih, &parent->wdata, + ndp->path.dentry->d_inode, AUDIT_IN_WATCH); if (wd < 0) { audit_free_parent(&parent->wdata); return ERR_PTR(wd); @@ -1161,11 +1161,11 @@ static int audit_get_nd(char *path, struct nameidata **ndp, static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) { if (ndp) { - path_release(ndp); + path_put(&ndp->path); kfree(ndp); } if (ndw) { - path_release(ndw); + path_put(&ndw->path); kfree(ndw); } } @@ -1214,8 +1214,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, /* update watch filter fields */ if (ndw) { - watch->dev = ndw->dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->dentry->d_inode->i_ino; + watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->path.dentry->d_inode->i_ino; } /* The audit_filter_mutex must not be held during inotify calls because @@ -1225,7 +1225,8 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, */ mutex_unlock(&audit_filter_mutex); - if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, + &i_watch) < 0) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { /* caller expects mutex locked */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c06ecf38d7b..782262e4107d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -208,8 +208,7 @@ struct audit_context { int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ - struct dentry * pwd; - struct vfsmount * pwdmnt; + struct path pwd; struct audit_context *previous; /* For nested syscalls */ struct audit_aux_data *aux; struct audit_aux_data *aux_pids; @@ -786,12 +785,9 @@ static inline void audit_free_names(struct audit_context *context) __putname(context->names[i].name); } context->name_count = 0; - if (context->pwd) - dput(context->pwd); - if (context->pwdmnt) - mntput(context->pwdmnt); - context->pwd = NULL; - context->pwdmnt = NULL; + path_put(&context->pwd); + context->pwd.dentry = NULL; + context->pwd.mnt = NULL; } static inline void audit_free_aux(struct audit_context *context) @@ -930,8 +926,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { audit_log_d_path(ab, "exe=", - vma->vm_file->f_path.dentry, - vma->vm_file->f_path.mnt); + &vma->vm_file->f_path); break; } vma = vma->vm_next; @@ -1005,9 +1000,10 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) { + if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } /* walk the whole argument looking for non-ascii chars */ @@ -1025,6 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; has_cntl = audit_string_contains_control(buf, to_send); @@ -1073,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * so we can be sure nothing was lost. 
*/ if ((i == 0) && (too_long)) - audit_log_format(*ab, "a%d_len=%ld ", arg_num, + audit_log_format(*ab, "a%d_len=%zu ", arg_num, has_cntl ? 2*len : len); /* @@ -1088,6 +1085,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; @@ -1341,10 +1339,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->target_sid, context->target_comm)) call_panic = 1; - if (context->pwd && context->pwdmnt) { + if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); + audit_log_d_path(ab, "cwd=", &context->pwd); audit_log_end(ab); } } @@ -1367,8 +1365,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case 0: /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", context->pwd, - context->pwdmnt); + audit_log_d_path(ab, " name=", &context->pwd); break; default: /* log the name's directory component */ @@ -1695,10 +1692,10 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd) { + if (!context->pwd.dentry) { read_lock(¤t->fs->lock); - context->pwd = dget(current->fs->pwd); - context->pwdmnt = mntget(current->fs->pwdmnt); + context->pwd = current->fs->pwd; + path_get(¤t->fs->pwd); read_unlock(¤t->fs->lock); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4766bb65e4d9..6d8de051382b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -113,9 +113,9 @@ static int root_count; #define dummytop (&rootnode.top_cgroup) /* This flag indicates whether tasks in the fork and exit paths should - * take callback_mutex and check for fork/exit handlers to call. This - * avoids us having to do extra work in the fork/exit path if none of the - * subsystems need to be called. + * check for fork/exit handlers to call. This avoids us having to do + * extra work in the fork/exit path if none of the subsystems need to + * be called. */ static int need_forkexit_callback; @@ -307,7 +307,6 @@ static inline void put_css_set_taskexit(struct css_set *cg) * template: location in which to build the desired set of subsystem * state objects for the new cgroup group */ - static struct css_set *find_existing_css_set( struct css_set *oldcg, struct cgroup *cgrp, @@ -320,7 +319,7 @@ static struct css_set *find_existing_css_set( /* Built the set of subsystem state objects that we want to * see in the new css_set */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1ull << i)) { + if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -354,7 +353,6 @@ static struct css_set *find_existing_css_set( * and chains them on tmp through their cgrp_link_list fields. Returns 0 on * success or a negative error */ - static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; @@ -396,7 +394,6 @@ static void free_cg_links(struct list_head *tmp) * substituted into the appropriate hierarchy. 
Must be called with * cgroup_mutex held */ - static struct css_set *find_css_set( struct css_set *oldcg, struct cgroup *cgrp) { @@ -473,7 +470,6 @@ static struct css_set *find_css_set( /* Link this cgroup group into the list */ list_add(&res->list, &init_css_set.list); css_set_count++; - INIT_LIST_HEAD(&res->tasks); write_unlock(&css_set_lock); return res; @@ -507,8 +503,8 @@ static struct css_set *find_css_set( * critical pieces of code here. The exception occurs on cgroup_exit(), * when a task in a notify_on_release cgroup exits. Then cgroup_mutex * is taken, and if the cgroup count is zero, a usermode call made - * to /sbin/cgroup_release_agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. + * to the release agent with the name of the cgroup (path relative to + * the root of cgroup file system) as the argument. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all @@ -521,7 +517,7 @@ static struct css_set *find_css_set( * * The need for this exception arises from the action of * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutexe, however there are + * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as @@ -537,7 +533,6 @@ static struct css_set *find_css_set( * cgroup_lock - lock out any changes to cgroup structures * */ - void cgroup_lock(void) { mutex_lock(&cgroup_mutex); @@ -548,7 +543,6 @@ void cgroup_lock(void) * * Undo the lock taken in a previous cgroup_lock() call. */ - void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); @@ -590,7 +584,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ - static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -600,7 +593,6 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? 
if so, kfree() associated cgroup */ @@ -696,7 +688,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long long bit = 1ull << i; + unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; @@ -790,7 +782,14 @@ static int parse_cgroupfs_options(char *data, if (!*token) return -EINVAL; if (!strcmp(token, "all")) { - opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; + /* Add all non-disabled subsystems */ + int i; + opts->subsys_bits = 0; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (!ss->disabled) + opts->subsys_bits |= 1ul << i; + } } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { @@ -808,7 +807,8 @@ static int parse_cgroupfs_options(char *data, for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; if (!strcmp(token, ss->name)) { - set_bit(i, &opts->subsys_bits); + if (!ss->disabled) + set_bit(i, &opts->subsys_bits); break; } } @@ -927,7 +927,6 @@ static int cgroup_get_rootdir(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ @@ -961,8 +960,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, } root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) + if (!root) { + if (opts.release_agent) + kfree(opts.release_agent); return -ENOMEM; + } init_cgroup_root(root); root->subsys_bits = opts.subsys_bits; @@ -1129,8 +1131,13 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return dentry->d_fsdata; } -/* - * Called with cgroup_mutex held. Writes path of cgroup into buf. +/** + * cgroup_path - generate the path of a cgroup + * @cgrp: the cgroup in question + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Called with cgroup_mutex held. Writes path of cgroup into buf. * Returns 0 on success, -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) @@ -1188,11 +1195,13 @@ static void get_first_subsys(const struct cgroup *cgrp, *subsys_id = test_ss->subsys_id; } -/* - * Attach task 'tsk' to cgroup 'cgrp' +/** + * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' + * @cgrp: the cgroup the task is attaching to + * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'pid' during call. + * Call holding cgroup_mutex. May take task_lock of + * the task 'tsk' during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1293,7 +1302,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) } /* The various types of files and directories in a cgroup file system */ - enum cgroup_filetype { FILE_ROOT, FILE_DIR, @@ -1584,12 +1592,11 @@ static int cgroup_create_file(struct dentry *dentry, int mode, } /* - * cgroup_create_dir - create a directory for an object. - * cgrp: the cgroup we create the directory for. - * It must have a valid ->parent field - * And we are going to fill its ->dentry field. - * dentry: dentry of the new cgroup - * mode: mode to set on new directory. + * cgroup_create_dir - create a directory for an object. + * @cgrp: the cgroup we create the directory for. 
It must have a valid + * ->parent field. And we are going to fill its ->dentry field. + * @dentry: dentry of the new cgroup + * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, int mode) @@ -1651,8 +1658,12 @@ int cgroup_add_files(struct cgroup *cgrp, return 0; } -/* Count the number of tasks in a cgroup. */ - +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. + */ int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; @@ -1711,7 +1722,12 @@ void cgroup_enable_task_cg_lists(void) use_task_css_set_links = 1; do_each_thread(g, p) { task_lock(p); - if (list_empty(&p->cg_list)) + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + */ + if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); @@ -1962,12 +1978,13 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) } /** - * Build and fill cgroupstats so that taskstats can export it to user - * space. - * + * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { @@ -2078,7 +2095,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) kfree(pidarray); } else { - ctr->buf = 0; + ctr->buf = NULL; ctr->bufsz = 0; } file->private_data = ctr; @@ -2199,14 +2216,13 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* - * cgroup_create - create a cgroup - * parent: cgroup that will be parent of the new cgroup. - * name: name of the new cgroup. Will be strcpy'ed. - * mode: mode to set on new inode + * cgroup_create - create a cgroup + * @parent: cgroup that will be parent of the new cgroup + * @dentry: dentry of the new cgroup + * @mode: mode to set on new inode * - * Must be called with the mutex on the parent inode held + * Must be called with the mutex on the parent inode held */ - static long cgroup_create(struct cgroup *parent, struct dentry *dentry, int mode) { @@ -2229,7 +2245,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - cgrp->flags = 0; INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); @@ -2239,6 +2254,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->root = parent->root; cgrp->top_cgroup = parent->top_cgroup; + if (notify_on_release(parent)) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); if (IS_ERR(css)) { @@ -2349,13 +2367,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* - * Call pre_destroy handlers of subsys + * Call pre_destroy handlers of subsys. Notify subsystems + * that rmdir() request comes. */ cgroup_call_pre_destroy(cgrp); - /* - * Notify subsyses that rmdir() request comes. 
- */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); @@ -2431,8 +2448,10 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) } /** - * cgroup_init_early - initialize cgroups at system boot, and - * initialize any subsystems that request early init. + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. */ int __init cgroup_init_early(void) { @@ -2474,8 +2493,10 @@ int __init cgroup_init_early(void) } /** - * cgroup_init - register cgroup filesystem and /proc file, and - * initialize any subsystems that didn't request early init. + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. */ int __init cgroup_init(void) { @@ -2553,6 +2574,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v) /* Skip this hierarchy if it has no active subsystems */ if (!root->actual_subsys_bits) continue; + seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); seq_putc(m, ':'); @@ -2592,13 +2614,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) { int i; - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\n", + seq_printf(m, "%s\t%lu\t%d\t%d\n", ss->name, ss->root->subsys_bits, - ss->root->number_of_cgroups); + ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); return 0; @@ -2606,7 +2628,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) static int cgroupstats_open(struct inode *inode, struct file *file) { - return single_open(file, proc_cgroupstats_show, 0); + return single_open(file, proc_cgroupstats_show, NULL); } static struct file_operations proc_cgroupstats_operations = { @@ -2618,7 +2640,7 @@ static struct file_operations proc_cgroupstats_operations = { /** * cgroup_fork - attach newly forked task to its parents cgroup. - * @tsk: pointer to task_struct of forking parent process. + * @child: pointer to task_struct of forking parent process. * * Description: A task inherits its parent's cgroup at fork(). * @@ -2642,9 +2664,12 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - called on a new task very soon before - * adding it to the tasklist. No need to take any locks since no-one - * can be operating on this task + * cgroup_fork_callbacks - run fork callbacks + * @child: the new task + * + * Called on a new task very soon before adding it to the + * tasklist. No need to take any locks since no-one can + * be operating on this task. */ void cgroup_fork_callbacks(struct task_struct *child) { @@ -2659,11 +2684,14 @@ void cgroup_fork_callbacks(struct task_struct *child) } /** - * cgroup_post_fork - called on a new task after adding it to the - * task list. Adds the task to the list running through its css_set - * if necessary. Has to be after the task is visible on the task list - * in case we race with the first call to cgroup_iter_start() - to - * guarantee that the new task ends up on its list. */ + * cgroup_post_fork - called on a new task after adding it to the task list + * @child: the task in question + * + * Adds the task to the list running through its css_set if necessary. 
+ * Has to be after the task is visible on the task list in case we race + * with the first call to cgroup_iter_start() - to guarantee that the + * new task ends up on its list. + */ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { @@ -2676,6 +2704,7 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process + * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -2706,7 +2735,6 @@ void cgroup_post_fork(struct task_struct *child) * top_cgroup isn't going away, and either task has PF_EXITING set, * which wards off any cgroup_attach_task() attempts, or task is a failed * fork, never visible to cgroup_attach_task. - * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -2743,9 +2771,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - duplicate the current cgroup in the hierarchy - * that the given subsystem is attached to, and move this task into - * the new child + * cgroup_clone - clone the cgroup the given subsystem is attached to + * @tsk: the task to be moved + * @subsys: the given subsystem + * + * Duplicate the current cgroup in the hierarchy that the given + * subsystem is attached to, and move this task into the new + * child. */ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) { @@ -2858,9 +2890,12 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) return ret; } -/* - * See if "cgrp" is a descendant of the current task's cgroup in - * the appropriate hierarchy +/** + * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * @cgrp: the cgroup in question + * + * See if @cgrp is a descendant of the current task's cgroup in + * the appropriate hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. @@ -2939,9 +2974,7 @@ void __css_put(struct cgroup_subsys_state *css) * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. - * */ - static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); @@ -2991,3 +3024,27 @@ static void cgroup_release_agent(struct work_struct *work) spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } + +static int __init cgroup_disable(char *str) +{ + int i; + char *token; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + if (!strcmp(token, ss->name)) { + ss->disabled = 1; + printk(KERN_INFO "Disabling %s control group" + " subsystem\n", ss->name); + break; + } + } + } + return 1; +} +__setup("cgroup_disable=", cgroup_disable); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e296ed81d4d..a1b61f414228 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Call without callback_mutex or task_lock() held. May be * called with or without cgroup_mutex held. Thanks in part to * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex and - * current->mm->mmap_sem during call. + * be NULL. This routine also might acquire callback_mutex during + * call. 
* * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded diff --git a/kernel/exit.c b/kernel/exit.c index 3b893e78ce61..073005b1cfb2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp) static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; - int ret = 1; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (p == ignored_task - || p->exit_state - || is_global_init(p->real_parent)) + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) continue; + if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) { - ret = 0; - break; - } + task_session(p->real_parent) == task_session(p)) + return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return ret; /* (sighing) "Often!" */ + + return 1; } int is_current_pgrp_orphaned(void) @@ -255,6 +254,37 @@ static int has_stopped_jobs(struct pid *pgrp) return retval; } +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. + */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + /** * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * @@ -512,14 +542,10 @@ static void __put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { - dput(fs->root); - mntput(fs->rootmnt); - dput(fs->pwd); - mntput(fs->pwdmnt); - if (fs->altroot) { - dput(fs->altroot); - mntput(fs->altrootmnt); - } + path_put(&fs->root); + path_put(&fs->pwd); + if (fs->altroot.dentry) + path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } @@ -639,22 +665,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); - /* - * process group orphan check - * Case ii: Our child is in a different pgrp - * than we are, and it was the only connection - * outside, so the child pgrp is now orphaned. - */ - if ((task_pgrp(p) != task_pgrp(father)) && - (task_session(p) == task_session(father))) { - struct pid *pgrp = task_pgrp(p); - - if (will_become_orphaned_pgrp(pgrp, NULL) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } - } + kill_orphaned_pgrp(p, father); } /* @@ -739,11 +750,9 @@ static void forget_original_parent(struct task_struct *father) * Send signals to all our closest relatives so that they know * to properly mourn us.. 
*/ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(struct task_struct *tsk, int group_dead) { int state; - struct task_struct *t; - struct pid *pgrp; /* * This does two things: @@ -757,25 +766,8 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - /* - * Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - * - * Case i: Our father is in a different pgrp than we are - * and we were the only connection outside, so our pgrp - * is about to become orphaned. - */ - t = tsk->real_parent; - - pgrp = task_pgrp(tsk); - if ((task_pgrp(t) != pgrp) && - (task_session(t) == task_session(tsk)) && - will_become_orphaned_pgrp(pgrp, tsk) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); /* Let father know we died * @@ -792,8 +784,8 @@ static void exit_notify(struct task_struct *tsk) * the same after a fork. */ if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && - ( tsk->parent_exec_id != t->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; @@ -990,7 +982,7 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); - exit_notify(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; @@ -1386,7 +1378,7 @@ unlock_sig: if (!retval && infop) retval = put_user(0, &infop->si_errno); if (!retval && infop) - retval = put_user(why, &infop->si_code); + retval = put_user((short)why, &infop->si_code); if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) @@ -1616,7 +1608,7 @@ asmlinkage long sys_waitid(int which, pid_t upid, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1648,7 +1640,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 4363a4eb84e3..9c042f901570 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -394,7 +394,6 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); - mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -416,6 +415,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + mm_free_cgroup(mm); mmdrop(mm); } } @@ -600,16 +600,16 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); - fs->rootmnt = mntget(old->rootmnt); - fs->root = dget(old->root); - fs->pwdmnt = mntget(old->pwdmnt); - fs->pwd = dget(old->pwd); - if (old->altroot) { - fs->altrootmnt = mntget(old->altrootmnt); - fs->altroot = dget(old->altroot); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + if (old->altroot.dentry) { + fs->altroot = old->altroot; + path_get(&old->altroot); } else { - fs->altrootmnt = NULL; - fs->altroot = NULL; + fs->altroot.mnt = NULL; + 
fs->altroot.dentry = NULL; } read_unlock(&old->lock); } diff --git a/kernel/futex.c b/kernel/futex.c index a6baaec44b8f..e43945e995f5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,8 @@ #include "rtmutex_common.h" +int __read_mostly futex_cmpxchg_enabled; + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -279,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, */ static void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (key->both.ptr == NULL) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -469,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key; + if (!futex_cmpxchg_enabled) + return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1870,6 +1874,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -1894,6 +1900,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, struct robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->robust_list; else { @@ -1997,6 +2006,9 @@ void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -2051,7 +2063,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret; + int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; struct rw_semaphore *fshared = NULL; @@ -2083,13 +2095,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, fshared); + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -2116,7 +2131,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } /* @@ -2143,10 +2158,31 @@ static struct file_system_type futex_fs_type = { .kill_sb = kill_anon_super, }; -static int __init init(void) +static int __init futex_init(void) { - int i = register_filesystem(&futex_fs_type); + u32 curval; + int i; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. 
+ */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) + futex_cmpxchg_enabled = 1; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + spin_lock_init(&futex_queues[i].lock); + } + + i = register_filesystem(&futex_fs_type); if (i) return i; @@ -2156,10 +2192,6 @@ static int __init init(void) return PTR_ERR(futex_mnt); } - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); - spin_lock_init(&futex_queues[i].lock); - } return 0; } -__initcall(init); +__initcall(futex_init); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 133d558db452..04ac3a9e42cf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, return 0; } -static void __user *futex_uaddr(struct robust_list *entry, +static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); @@ -54,6 +54,9 @@ void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -115,6 +118,9 @@ asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -130,6 +136,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct compat_robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->compat_robust_list; else { @@ -176,7 +185,7 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3f4a57c7895d..98bee013f71f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -326,6 +326,23 @@ u64 ktime_divns(const ktime_t kt, s64 div) #endif /* BITS_PER_LONG >= 64 */ /* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +/* * Check, whether the timer is on the callback pending list */ static inline int hrtimer_cb_pending(const struct hrtimer *timer) @@ -425,6 +442,8 @@ static int hrtimer_reprogram(struct hrtimer *timer, ktime_t expires = ktime_sub(timer->expires, base->offset); int res; + WARN_ON_ONCE(timer->expires.tv64 < 0); + /* * When the callback is running, we do not reprogram the clock event * device. The timer callback is either running on a different CPU or @@ -435,6 +454,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (hrtimer_callback_running(timer)) return 0; + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. 
Nothing wrong + * about that, just avoid to call into the tick code, which + * has now objections against negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + if (expires.tv64 >= expires_next->tv64) return 0; @@ -682,13 +710,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) */ orun++; } - timer->expires = ktime_add(timer->expires, interval); - /* - * Make sure, that the result did not wrap with a very large - * interval. - */ - if (timer->expires.tv64 < 0) - timer->expires = ktime_set(KTIME_SEC_MAX, 0); + timer->expires = ktime_add_safe(timer->expires, interval); return orun; } @@ -839,7 +861,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) new_base = switch_hrtimer_base(timer, base); if (mode == HRTIMER_MODE_REL) { - tim = ktime_add(tim, new_base->get_time()); + tim = ktime_add_safe(tim, new_base->get_time()); /* * CONFIG_TIME_LOW_RES is a temporary way for architectures * to signal that they simply return xtime in @@ -848,16 +870,8 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add(tim, base->resolution); + tim = ktime_add_safe(tim, base->resolution); #endif - /* - * Careful here: User space might have asked for a - * very long sleep, so the add above might result in a - * negative number, which enqueues the timer in front - * of the queue. - */ - if (tim.tv64 < 0) - tim.tv64 = KTIME_MAX; } timer->expires = tim; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cc54c6276356..fdb3fbe2b0c4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -246,6 +246,17 @@ static unsigned int default_startup(unsigned int irq) } /* + * default shutdown function + */ +static void default_shutdown(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; +} + +/* * Fixup enable/disable function pointers */ void irq_chip_set_defaults(struct irq_chip *chip) @@ -256,8 +267,15 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->disable = default_disable; if (!chip->startup) chip->startup = default_startup; + /* + * We use chip->disable, when the user provided its own. When + * we have default_disable set for chip->disable, then we need + * to use default_shutdown, otherwise the irq line is not + * disabled on free_irq(): + */ if (!chip->shutdown) - chip->shutdown = chip->disable; + chip->shutdown = chip->disable != default_disable ? + chip->disable : default_shutdown; if (!chip->name) chip->name = chip->typename; if (!chip->end) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a6b2bc831dd0..088dabbf2d6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -6,6 +6,7 @@ * This file contains spurious interrupt handling. */ +#include <linux/jiffies.h> #include <linux/irq.h> #include <linux/module.h> #include <linux/kallsyms.h> @@ -179,7 +180,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * otherwise the couter becomes a doomsday timer for otherwise * working systems */ - if (jiffies - desc->last_unhandled > HZ/10) + if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; else desc->irqs_unhandled++; diff --git a/kernel/kgdb.c b/kernel/kgdb.c new file mode 100644 index 000000000000..1bd0ec1c80b2 --- /dev/null +++ b/kernel/kgdb.c @@ -0,0 +1,1700 @@ +/* + * KGDB stub. 
+ * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2008 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#include <linux/pid_namespace.h> +#include <linux/clocksource.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/threads.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/pid.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/byteorder.h> +#include <asm/atomic.h> +#include <asm/system.h> + +static int kgdb_break_asap; + +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +static struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; +} kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +static int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +static struct kgdb_io *kgdb_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; + +/* Our I/O buffers. 
*/ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overriden by architectures when needed: + */ +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + + return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +void __weak +kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) +{ + return; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. 
+ */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * GDB remote protocol parser: + */ + +static const char hexchars[] = "0123456789abcdef"; + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +/* scan for the sequence $<data>#<checksum> */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (kgdb_io_ops->read_char())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = kgdb_io_ops->read_char(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(kgdb_io_ops->read_char()) << 4; + xmitcsum += hex(kgdb_io_ops->read_char()); + + if (checksum != xmitcsum) + /* failed checksum */ + kgdb_io_ops->write_char('-'); + else + /* successful transfer */ + kgdb_io_ops->write_char('+'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $<packet info>#<checksum>. + */ + while (1) { + kgdb_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + kgdb_io_ops->write_char(ch); + checksum += ch; + count++; + } + + kgdb_io_ops->write_char('#'); + kgdb_io_ops->write_char(hexchars[checksum >> 4]); + kgdb_io_ops->write_char(hexchars[checksum & 0xf]); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = kgdb_io_ops->read_char(); + + if (ch == 3) + ch = kgdb_io_ops->read_char(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + kgdb_io_ops->write_char('-'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + return; + } + } +} + +static char *pack_hex_byte(char *pkt, u8 byte) +{ + *pkt++ = hexchars[byte >> 4]; + *pkt++ = hexchars[byte & 0xf]; + + return pkt; +} + +/* + * Convert the memory pointed to by mem into hex, placing result in buf. + * Return a pointer to the last char put in buf (null). May return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return a pointer to the character after + * the last byte written. 
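[Editor's note] get_packet() and put_packet() above implement the $&lt;data&gt;#&lt;checksum&gt; framing of the GDB remote serial protocol, where the checksum is the low eight bits of the sum of the payload bytes, written as two lowercase hex digits. A standalone sketch of the same framing for reference; the frame_packet() helper is illustrative, not part of the patch:

/* gdb-frame.c -- sketch of the $<data>#<checksum> framing used by
 * put_packet()/get_packet(). Checksum = sum of payload bytes mod 256.
 */
#include <stdio.h>
#include <string.h>

static void frame_packet(const char *payload, char *out, size_t outlen)
{
	unsigned char csum = 0;
	size_t i;

	for (i = 0; payload[i]; i++)
		csum += (unsigned char)payload[i];

	snprintf(out, outlen, "$%s#%02x", payload, csum);
}

int main(void)
{
	char frame[64];

	frame_packet("OK", frame, sizeof(frame));
	puts(frame);		/* prints: $OK#9a */
	return 0;
}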
+ */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int err = 0; + char c; + + while (count-- > 0) { + c = *buf++; + if (c == 0x7d) + c = *buf++ ^ 0x20; + + err = probe_kernel_write(mem, &c, 1); + if (err) + break; + + mem++; + } + + return err; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in mem. + * Return a pointer to the character AFTER the last byte written. + * May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, long *long_val) +{ + int hex_val; + int num = 0; + + *long_val = 0; + + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + return num; +} + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length + 1); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hexchars[(error / 10)]; + pkt[2] = hexchars[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. 
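[Editor's note] The flat TID space described above is the one shadow_pid() (further down) produces: a normal task keeps its PID as the GDB thread id, while the idle task of CPU n, which has PID 0, is reported as -(n + 1). A standalone sketch of that mapping and its inverse; the helper names are illustrative only:

/* tid-map.c -- sketch of the TID remapping exposed to GDB.  Normal tasks
 * keep their PID; per-CPU idle tasks (all PID 0) become -1, -2, ...
 */
#include <assert.h>
#include <stdio.h>

static long pid_to_tid(long pid, int cpu)
{
	return pid ? pid : -1 - cpu;	/* idle task of CPU n -> -(n + 1) */
}

static int tid_is_idle(long tid, int *cpu)
{
	if (tid > 0)
		return 0;
	*cpu = (int)(-tid - 1);		/* inverse of the mapping above */
	return 1;
}

int main(void)
{
	int cpu;

	assert(pid_to_tid(1234, 0) == 1234);
	assert(pid_to_tid(0, 2) == -3);	/* idle task of CPU 2 */
	assert(tid_is_idle(-3, &cpu) && cpu == 2);
	puts("mapping ok");
	return 0;
}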
+ */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + *scan++ = (value >> 24) & 0xff; + *scan++ = (value >> 16) & 0xff; + *scan++ = (value >> 8) & 0xff; + *scan++ = (value & 0xff); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped idle tasks: + */ + if (tid <= 0) + return idle_task(-tid); + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + +/* + * CPU debug state control: + */ + +#ifdef CONFIG_SMP +static void kgdb_wait(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + smp_wmb(); + atomic_set(&cpu_in_kgdb[cpu], 1); + + /* Wait till primary CPU is done with debugging */ + while (atomic_read(&passive_cpu_wait[cpu])) + cpu_relax(); + + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; + + /* fix up hardware debug registers on local cpu */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + + /* Signal the primary CPU that we are done: */ + atomic_set(&cpu_in_kgdb[cpu], 0); + clocksource_touch_watchdog(); + local_irq_restore(flags); +} +#endif + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +static int kgdb_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return 0; +} + +static int kgdb_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +static int 
kgdb_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return 0; +} + +static int kgdb_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +int remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -1-raw_smp_processor_id(); +} + +static char gdbmsgbuf[BUFMAX + 1]; + +static void kgdb_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!kgdb_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + return 1; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. 
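[Editor's note] kgdb_msg_write() above is what backs the kgdb console: text is wrapped in GDB 'O' (output) packets, with each message byte encoded as two hex digits. A standalone sketch of that payload encoding, using a hypothetical helper and omitting the outer $...#csum framing shown earlier:

/* o-packet.c -- sketch of the 'O' console-output payload built by
 * kgdb_msg_write(): 'O' followed by the message bytes in two-digit hex.
 */
#include <stdio.h>

static void encode_o_packet(const char *msg, char *out)
{
	static const char hexchars[] = "0123456789abcdef";
	size_t i, j = 0;

	out[j++] = 'O';
	for (i = 0; msg[i]; i++) {
		out[j++] = hexchars[(unsigned char)msg[i] >> 4];
		out[j++] = hexchars[(unsigned char)msg[i] & 0xf];
	}
	out[j] = '\0';
}

int main(void)
{
	char buf[128];

	encode_o_packet("Hi\n", buf);
	puts(buf);		/* prints: O48690a */
	return 0;
}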
+ */ + remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for (i = 0; i < NR_CPUS; i++) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in __schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + 
kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *thread; + unsigned char thref[8]; + char *ptr; + int i; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + + if (remcom_in_buffer[1] == 'f') + ks->threadid = 1; + + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + + for (i = 0; i < 17; ks->threadid++) { + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) { + int_to_threadref(thref, ks->threadid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if (ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "Shadow task %d for pid 0", + (int)(-ks->threadid-1)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. 
*/ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = kgdb_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = kgdb_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + error_packet(remcom_out_buffer, -EINVAL); + return 0; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +static int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. 
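[Editor's note] The stop reply assembled at the top of gdb_serial_stub() above has the shape T&lt;signo&gt;thread:&lt;tid&gt;;, where the thread id is the 8-byte, zero-padded big-endian value produced by int_to_threadref() and hex-packed by pack_threadid(). A standalone sketch building the same payload for signal 5 on thread 42 (values chosen only for illustration):

/* stop-reply.c -- sketch of the "T<sig>thread:<tid>;" stop reply payload
 * sent when a connected gdb regains control of the target.
 */
#include <stdio.h>

int main(void)
{
	unsigned char tid[8] = { 0 };	/* int_to_threadref(): zero-padded */
	int signo = 5;			/* SIGTRAP */
	int threadid = 42;
	char pkt[64];
	int n, i;

	/* 32-bit thread id, big-endian, in the last four bytes */
	tid[4] = (threadid >> 24) & 0xff;
	tid[5] = (threadid >> 16) & 0xff;
	tid[6] = (threadid >> 8) & 0xff;
	tid[7] = threadid & 0xff;

	n = sprintf(pkt, "T%02xthread:", signo);
	for (i = 0; i < 8; i++)
		n += sprintf(pkt + n, "%02x", tid[i]);
	sprintf(pkt + n, ";");

	puts(pkt);	/* prints: T05thread:000000000000002a; */
	return 0;
}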
+ */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kgdb_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. + */ + if (kgdb_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + kgdb_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + unsigned long flags; + int error = 0; + int i, cpu; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->ex_vector = evector; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! */ + +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. 
+ */ + local_irq_save(flags); + + cpu = raw_smp_processor_id(); + + /* + * Acquire the kgdb_active lock: + */ + while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) + cpu_relax(); + + /* + * Do not start the debugger connection on this CPU if the last + * instance of the exception handler wanted to come into the + * debugger on a different CPU via a single step + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + atomic_read(&kgdb_cpu_doing_single_step) != cpu) { + + atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + error = 1; + goto kgdb_restore; /* No I/O connection, so resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. + */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (kgdb_io_ops->pre_exception) + kgdb_io_ops->pre_exception(); + + kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; + kgdb_info[ks->cpu].task = current; + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step || !kgdb_contthread) { + for (i = 0; i < NR_CPUS; i++) + atomic_set(&passive_cpu_wait[i], 1); + } + + /* + * spin_lock code is good enough as a barrier so we don't + * need one here: + */ + atomic_set(&cpu_in_kgdb[ks->cpu], 1); + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (!atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); + kgdb_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = NULL; + exception_level = 0; + + /* Talk to debugger with gdbserial protocol */ + error = gdb_serial_stub(ks); + + /* Call the I/O driver's post_exception routine */ + if (kgdb_io_ops->post_exception) + kgdb_io_ops->post_exception(); + + kgdb_info[ks->cpu].debuggerinfo = NULL; + kgdb_info[ks->cpu].task = NULL; + atomic_set(&cpu_in_kgdb[ks->cpu], 0); + + if (!kgdb_single_step || !kgdb_contthread) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_set(&passive_cpu_wait[i], 0); + /* + * Wait till all the CPUs have quit + * from the debugger. + */ + for_each_online_cpu(i) { + while (atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + } + +kgdb_restore: + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + return error; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != cpu && + atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { + kgdb_wait((struct pt_regs *)regs); + return 0; + } +#endif + return 1; +} + +void kgdb_console_write(struct console *co, const char *s, unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. 
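[Editor's note] The CPU roundup in kgdb_handle_exception() above is a two-flag handshake per CPU: the primary raises passive_cpu_wait[i] for everyone, each secondary enters kgdb_wait(), publishes its registers, raises cpu_in_kgdb[i] and spins; on resume the primary clears passive_cpu_wait[] and waits for cpu_in_kgdb[] to drop. A userspace sketch of that handshake with one secondary modelled as a thread (C11 atomics stand in for the kernel's atomic_t):

/* roundup.c -- sketch of the primary/secondary park-and-release handshake
 * performed with passive_cpu_wait[] and cpu_in_kgdb[].
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int passive_cpu_wait;	/* primary asks secondary to spin */
static atomic_int cpu_in_kgdb;		/* secondary acknowledges it parked */

static void *secondary(void *arg)
{
	(void)arg;
	/* kgdb_wait(): publish state, then park until released */
	atomic_store(&cpu_in_kgdb, 1);
	while (atomic_load(&passive_cpu_wait))
		;			/* cpu_relax() in the kernel */
	atomic_store(&cpu_in_kgdb, 0);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	atomic_store(&passive_cpu_wait, 1);	/* ask the other CPU to park */
	pthread_create(&tid, NULL, secondary, NULL);

	while (!atomic_load(&cpu_in_kgdb))	/* wait until it is parked */
		;
	/* ... debugger session would run here ... */

	atomic_store(&passive_cpu_wait, 0);	/* release it */
	while (atomic_load(&cpu_in_kgdb))	/* wait for it to leave kgdb */
		;

	pthread_join(tid, NULL);
	puts("roundup/resume handshake complete");
	return 0;
}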
*/ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1) + return; + + local_irq_save(flags); + kgdb_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_gdb(int key, struct tty_struct *tty) +{ + if (!kgdb_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) + printk(KERN_CRIT "Entering KGDB\n"); + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_gdb_op = { + .handler = sysrq_handle_gdb, + .help_msg = "Gdb", + .action_msg = "GDB", +}; +#endif + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_kgdb_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (kgdb_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_kgdb_io_ops->init) { + err = new_kgdb_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + kgdb_io_ops = new_kgdb_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_kgdb_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_kgdb_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); + kgdb_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_kgdb_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. 
It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_set(&kgdb_setting_breakpoint, 1); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_set(&kgdb_setting_breakpoint, 0); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/kmod.c b/kernel/kmod.c index bb7df2a28bd7..22be3ff3f363 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -173,10 +173,7 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - retval = -EPERM; - if (current->fs->root) - retval = kernel_execve(sub_info->path, - sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7a86e6432338..fcfb580c3afc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr) return 0; } +/* + * If we have a symbol_name argument, look it up and add the offset field + * to it. This way, we can specify a relative address to a symbol. + */ +static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) +{ + kprobe_opcode_t *addr = p->addr; + if (p->symbol_name) { + if (addr) + return NULL; + kprobe_lookup_name(p->symbol_name, addr); + } + + if (!addr) + return NULL; + return (kprobe_opcode_t *)(((char *)addr) + p->offset); +} + static int __kprobes __register_kprobe(struct kprobe *p, unsigned long called_from) { int ret = 0; struct kprobe *old_p; struct module *probed_mod; + kprobe_opcode_t *addr; - /* - * If we have a symbol_name argument look it up, - * and add it to the address. That way the addr - * field can either be global or relative to a symbol. - */ - if (p->symbol_name) { - if (p->addr) - return -EINVAL; - kprobe_lookup_name(p->symbol_name, p->addr); - } - - if (!p->addr) + addr = kprobe_addr(p); + if (!addr) return -EINVAL; - p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + p->addr = addr; if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) @@ -678,8 +687,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_kprobe(&jp->kp); } -#ifdef ARCH_SUPPORTS_KRETPROBES - +#ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. 
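[Editor's note] The new kprobe_addr() helper above makes the symbol_name/offset pair the canonical way to place a probe, and setting both ->addr and ->symbol_name is now rejected. A minimal module-style sketch of that usage; the probed symbol and the handler body are examples only, not part of the patch:

/* kprobe-by-name.c -- sketch of registering a kprobe through the
 * symbol_name + offset path resolved by kprobe_addr().
 */
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "hit %s+0x%x\n", p->symbol_name, p->offset);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* looked up by kprobe_lookup_name() */
	.offset		= 0,		/* added to the resolved address */
	.pre_handler	= pre_handler,
	/* .addr must stay NULL when symbol_name is used */
};

static int __init kp_init(void)
{
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");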
@@ -722,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp) int ret = 0; struct kretprobe_instance *inst; int i; - void *addr = rp->kp.addr; + void *addr; if (kretprobe_blacklist_size) { - if (addr == NULL) - kprobe_lookup_name(rp->kp.symbol_name, addr); - addr += rp->kp.offset; + addr = kprobe_addr(&rp->kp); + if (!addr) + return -EINVAL; for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) @@ -769,8 +777,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) return ret; } -#else /* ARCH_SUPPORTS_KRETPROBES */ - +#else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; @@ -781,8 +788,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, { return 0; } - -#endif /* ARCH_SUPPORTS_KRETPROBES */ +#endif /* CONFIG_KRETPROBES */ void __kprobes unregister_kretprobe(struct kretprobe *rp) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3574379f4d62..81a4e4a3f087 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); debug_atomic_dec(&nr_unused_locks); break; default: diff --git a/kernel/marker.c b/kernel/marker.c index 5323cfaedbce..005b95954593 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -27,35 +27,42 @@ extern struct marker __start___markers[]; extern struct marker __stop___markers[]; +/* Set to 1 to enable marker debug output */ +const int marker_debug; + /* * markers_mutex nests inside module_mutex. Markers mutex protects the builtin - * and module markers, the hash table and deferred_sync. + * and module markers and the hash table. */ static DEFINE_MUTEX(markers_mutex); /* - * Marker deferred synchronization. - * Upon marker probe_unregister, we delay call to synchronize_sched() to - * accelerate mass unregistration (only when there is no more reference to a - * given module do we call synchronize_sched()). However, we need to make sure - * every critical region has ended before we re-arm a marker that has been - * unregistered and then registered back with a different probe data. - */ -static int deferred_sync; - -/* * Marker hash table, containing the active markers. * Protected by module_mutex. */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ struct marker_entry { struct hlist_node hlist; char *format; - marker_probe_func *probe; - void *private; + void (*call)(const struct marker *mdata, /* Probe wrapper */ + void *call_private, const char *fmt, ...); + struct marker_probe_closure single; + struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. 
*/ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /** * __mark_empty_function - Empty probe callback - * @mdata: pointer of type const struct marker + * @probe_private: probe private data + * @call_private: call site private data * @fmt: format string * @...: variable argument list * @@ -72,13 +80,267 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; * though the function pointer change and the marker enabling are two distinct * operations that modifies the execution flow of preemptible code. */ -void __mark_empty_function(const struct marker *mdata, void *private, - const char *fmt, ...) +void __mark_empty_function(void *probe_private, void *call_private, + const char *fmt, va_list *args) { } EXPORT_SYMBOL_GPL(__mark_empty_function); /* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. + */ +void marker_probe_cb(const struct marker *mdata, void *call_private, + const char *fmt, ...) +{ + va_list args; + char ptype; + + /* + * preempt_disable does two things : disabling preemption to make sure + * the teardown of the callbacks can be done correctly when they are in + * modules and they insure RCU read coherency. + */ + preempt_disable(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, fmt); + func(mdata->single.probe_private, call_private, fmt, &args); + va_end(args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + multi = mdata->multi; + for (i = 0; multi[i].func; i++) { + va_start(args, fmt); + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + va_end(args); + } + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @fmt: format string + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, const char *fmt, ...) +{ + va_list args; /* not initialized */ + char ptype; + + preempt_disable(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. 
They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata->single.probe_private, call_private, fmt, &args); + } else { + struct marker_probe_closure *multi; + int i; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + multi = mdata->multi; + for (i = 0; multi[i].func; i++) + multi[i].func(multi[i].probe_private, call_private, fmt, + &args); + } + preempt_enable(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_entry *entry = container_of(head, + struct marker_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi[i].func, + entry->multi[i].probe_private); + } +} + +static struct marker_probe_closure * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_closure *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe + && old[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new[0] = entry->single; + else + memcpy(new, old, + nr_probes * sizeof(struct marker_probe_closure)); + new[nr_probes].func = probe; + new[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_closure * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_closure *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 
0) probes */ + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if ((!probe || old[nr_probes].func == probe) + && old[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + entry->single = old[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i].func; i++) + if ((probe && old[i].func != probe) || + old[i].probe_private != probe_private) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* * Get marker if the marker is present in the marker hash table. * Must be called with markers_mutex held. * Returns NULL if not present. @@ -102,8 +364,7 @@ static struct marker_entry *get_marker(const char *name) * Add the marker to the marker hash table. Must be called with markers_mutex * held. */ -static int add_marker(const char *name, const char *format, - marker_probe_func *probe, void *private) +static struct marker_entry *add_marker(const char *name, const char *format) { struct hlist_head *head; struct hlist_node *node; @@ -118,9 +379,8 @@ static int add_marker(const char *name, const char *format, hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { printk(KERN_NOTICE - "Marker %s busy, probe %p already installed\n", - name, e->probe); - return -EBUSY; /* Already there */ + "Marker %s busy\n", name); + return ERR_PTR(-EBUSY); /* Already there */ } } /* @@ -130,34 +390,42 @@ static int add_marker(const char *name, const char *format, e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memcpy(&e->name[0], name, name_len); if (format) { e->format = &e->name[name_len]; memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; trace_mark(core_marker_format, "name %s format %s", e->name, e->format); - } else + } else { e->format = NULL; - e->probe = probe; - e->private = private; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; e->refcount = 0; + e->rcu_pending = 0; hlist_add_head(&e->hlist, head); - return 0; + return e; } /* * Remove the marker from the marker hash table. Must be called with mutex_lock * held. 
*/ -static void *remove_marker(const char *name) +static int remove_marker(const char *name) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; int found = 0; size_t len = strlen(name) + 1; - void *private = NULL; u32 hash = jhash(name, len-1, 0); head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; @@ -167,12 +435,16 @@ static void *remove_marker(const char *name) break; } } - if (found) { - private = e->private; - hlist_del(&e->hlist); - kfree(e); - } - return private; + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu has been executed */ + if (e->rcu_pending) + rcu_barrier(); + kfree(e); + return 0; } /* @@ -184,6 +456,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format) size_t name_len = strlen((*entry)->name) + 1; size_t format_len = strlen(format) + 1; + e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, GFP_KERNEL); if (!e) @@ -191,11 +464,20 @@ static int marker_set_format(struct marker_entry **entry, const char *format) memcpy(&e->name[0], (*entry)->name, name_len); e->format = &e->name[name_len]; memcpy(e->format, format, format_len); - e->probe = (*entry)->probe; - e->private = (*entry)->private; + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + e->single = (*entry)->single; + e->multi = (*entry)->multi; + e->ptype = (*entry)->ptype; e->refcount = (*entry)->refcount; + e->rcu_pending = 0; hlist_add_before(&e->hlist, &(*entry)->hlist); hlist_del(&(*entry)->hlist); + /* Make sure the call_rcu has been executed */ + if ((*entry)->rcu_pending) + rcu_barrier(); kfree(*entry); *entry = e; trace_mark(core_marker_format, "name %s format %s", @@ -206,7 +488,8 @@ static int marker_set_format(struct marker_entry **entry, const char *format) /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem) +static int set_marker(struct marker_entry **entry, struct marker *elem, + int active) { int ret; WARN_ON(strcmp((*entry)->name, elem->name) != 0); @@ -226,26 +509,64 @@ static int set_marker(struct marker_entry **entry, struct marker *elem) if (ret) return ret; } - elem->call = (*entry)->probe; - elem->private = (*entry)->private; - elem->state = 1; + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = (*entry)->call; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private + != (*entry)->single.probe_private && + !elem->ptype); + elem->single.probe_private = (*entry)->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = (*entry)->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, (*entry)->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. 
+ */ + smp_wmb(); + elem->ptype = (*entry)->ptype; + elem->state = active; + return 0; } /* * Disable a marker and its probe callback. - * Note: only after a synchronize_sched() issued after setting elem->call to the - * empty function insures that the original callback is not used anymore. This - * insured by preemption disabling around the call site. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. */ static void disable_marker(struct marker *elem) { + /* leave "call" as is. It is known statically. */ elem->state = 0; - elem->call = __mark_empty_function; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and - * should be done only after a synchronize_sched(). These are never used - * until the next initialization anyway. + * should be done only after an RCU period. These are never used until + * the next initialization anyway. */ } @@ -253,14 +574,11 @@ static void disable_marker(struct marker *elem) * marker_update_probe_range - Update a probe range * @begin: beginning of the range * @end: end of the range - * @probe_module: module address of the probe being updated - * @refcount: number of references left to the given probe_module (out) * * Updates the probe callback corresponding to a range of markers. */ void marker_update_probe_range(struct marker *begin, - struct marker *end, struct module *probe_module, - int *refcount) + struct marker *end) { struct marker *iter; struct marker_entry *mark_entry; @@ -268,15 +586,12 @@ void marker_update_probe_range(struct marker *begin, mutex_lock(&markers_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); - if (mark_entry && mark_entry->refcount) { - set_marker(&mark_entry, iter); + if (mark_entry) { + set_marker(&mark_entry, iter, + !!mark_entry->refcount); /* * ignore error, continue */ - if (probe_module) - if (probe_module == - __module_text_address((unsigned long)mark_entry->probe)) - (*refcount)++; } else { disable_marker(iter); } @@ -286,23 +601,27 @@ void marker_update_probe_range(struct marker *begin, /* * Update probes, removing the faulty probes. - * Issues a synchronize_sched() when no reference to the module passed - * as parameter is found in the probes so the probe module can be - * safely unloaded from now on. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Site effect : marker_set_format may delete the marker entry (creating a + * replacement). */ -static void marker_update_probes(struct module *probe_module) +static void marker_update_probes(void) { - int refcount = 0; - /* Core kernel markers */ - marker_update_probe_range(__start___markers, - __stop___markers, probe_module, &refcount); + marker_update_probe_range(__start___markers, __stop___markers); /* Markers in modules. 
*/ - module_update_markers(probe_module, &refcount); - if (probe_module && refcount == 0) { - synchronize_sched(); - deferred_sync = 0; - } + module_update_markers(); } /** @@ -310,33 +629,52 @@ static void marker_update_probes(struct module *probe_module) * @name: marker name * @format: format string * @probe: probe handler - * @private: probe private data + * @probe_private: probe private data * * private data must be a valid allocated memory address, or NULL. * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. */ int marker_probe_register(const char *name, const char *format, - marker_probe_func *probe, void *private) + marker_probe_func *probe, void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); entry = get_marker(name); - if (entry && entry->refcount) { - ret = -EBUSY; - goto end; - } - if (deferred_sync) { - synchronize_sched(); - deferred_sync = 0; + if (!entry) { + entry = add_marker(name, format); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } } - ret = add_marker(name, format, probe, private); - if (ret) + /* + * If we detect that a call_rcu is pending for this marker, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; + } mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); return ret; @@ -346,171 +684,173 @@ EXPORT_SYMBOL_GPL(marker_probe_register); /** * marker_probe_unregister - Disconnect a probe from a marker * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data * * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. */ -void *marker_probe_unregister(const char *name) +int marker_probe_unregister(const char *name, + marker_probe_func *probe, void *probe_private) { - struct module *probe_module; struct marker_entry *entry; - void *private; + struct marker_probe_closure *old; + int ret = -ENOENT; mutex_lock(&markers_mutex); entry = get_marker(name); - if (!entry) { - private = ERR_PTR(-ENOENT); + if (!entry) goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? 
*/ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(name); - deferred_sync = 1; + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, probe, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; + marker_update_probes(); /* may update entry */ + mutex_lock(&markers_mutex); + entry = get_marker(name); + if (!entry) + goto end; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); + remove_marker(name); /* Ignore busy error message */ + ret = 0; end: mutex_unlock(&markers_mutex); - return private; + return ret; } EXPORT_SYMBOL_GPL(marker_probe_unregister); -/** - * marker_probe_unregister_private_data - Disconnect a probe from a marker - * @private: probe private data - * - * Unregister a marker by providing the registered private data. - * Returns the private data given to marker_probe_register, or an ERR_PTR(). - */ -void *marker_probe_unregister_private_data(void *private) +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) { - struct module *probe_module; - struct hlist_head *head; - struct hlist_node *node; struct marker_entry *entry; - int found = 0; unsigned int i; + struct hlist_head *head; + struct hlist_node *node; - mutex_lock(&markers_mutex); for (i = 0; i < MARKER_TABLE_SIZE; i++) { head = &marker_table[i]; hlist_for_each_entry(entry, node, head, hlist) { - if (entry->private == private) { - found = 1; - goto iter_end; + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_closure *closure; + closure = entry->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func == probe && + closure[i].probe_private + == probe_private) + return entry; + } } } } -iter_end: - if (!found) { - private = ERR_PTR(-ENOENT); - goto end; - } - entry->refcount = 0; - /* In what module is the probe handler ? */ - probe_module = __module_text_address((unsigned long)entry->probe); - private = remove_marker(entry->name); - deferred_sync = 1; - mutex_unlock(&markers_mutex); - marker_update_probes(probe_module); - return private; -end: - mutex_unlock(&markers_mutex); - return private; + return NULL; } -EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** - * marker_arm - Arm a marker - * @name: marker name + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data * - * Activate a marker. It keeps a reference count of the number of - * arming/disarming done. - * Returns 0 if ok, error value on error. + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. 
*/ -int marker_arm(const char *name) +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) { struct marker_entry *entry; int ret = 0; + struct marker_probe_closure *old; mutex_lock(&markers_mutex); - entry = get_marker(name); + entry = get_marker_from_private_data(probe, probe_private); if (!entry) { ret = -ENOENT; goto end; } - /* - * Only need to update probes when refcount passes from 0 to 1. - */ - if (entry->refcount++) - goto end; -end: + if (entry->rcu_pending) + rcu_barrier(); + old = marker_entry_remove_probe(entry, NULL, probe_private); mutex_unlock(&markers_mutex); - marker_update_probes(NULL); - return ret; -} -EXPORT_SYMBOL_GPL(marker_arm); - -/** - * marker_disarm - Disarm a marker - * @name: marker name - * - * Disarm a marker. It keeps a reference count of the number of arming/disarming - * done. - * Returns 0 if ok, error value on error. - */ -int marker_disarm(const char *name) -{ - struct marker_entry *entry; - int ret = 0; - + marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); - entry = get_marker(name); - if (!entry) { - ret = -ENOENT; - goto end; - } - /* - * Only permit decrement refcount if higher than 0. - * Do probe update only on 1 -> 0 transition. - */ - if (entry->refcount) { - if (--entry->refcount) - goto end; - } else { - ret = -EPERM; - goto end; - } + entry = get_marker_from_private_data(probe, probe_private); + WARN_ON(!entry); + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif + call_rcu(&entry->rcu, free_old_closure); + remove_marker(entry->name); /* Ignore busy error message */ end: mutex_unlock(&markers_mutex); - marker_update_probes(NULL); return ret; } -EXPORT_SYMBOL_GPL(marker_disarm); +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); /** * marker_get_private_data - Get a marker's probe private data * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. * Returns the private data pointer, or an ERR_PTR. * The private data pointer should _only_ be dereferenced if the caller is the * owner of the data, or its content could vanish. This is mostly used to * confirm that a caller is the owner of a registered probe. 
*/ -void *marker_get_private_data(const char *name) +void *marker_get_private_data(const char *name, marker_probe_func *probe, + int num) { struct hlist_head *head; struct hlist_node *node; struct marker_entry *e; size_t name_len = strlen(name) + 1; u32 hash = jhash(name, name_len-1, 0); - int found = 0; + int i; head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; hlist_for_each_entry(e, node, head, hlist) { if (!strcmp(name, e->name)) { - found = 1; - return e->private; + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + else + break; + } else { + struct marker_probe_closure *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure[i].func; i++) { + if (closure[i].func != probe) + continue; + if (match++ == num) + return closure[i].probe_private; + } + } } } return ERR_PTR(-ENOENT); diff --git a/kernel/module.c b/kernel/module.c index 4202da97a1da..5d437bffd8dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -987,12 +987,11 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, return ret; } - /* * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> */ -#ifdef CONFIG_KALLSYMS +#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) static ssize_t module_sect_show(struct module_attribute *mattr, struct module *mod, char *buf) { @@ -1188,7 +1187,7 @@ static inline void add_notes_attrs(struct module *mod, unsigned int nsect, static inline void remove_notes_attrs(struct module *mod) { } -#endif /* CONFIG_KALLSYMS */ +#endif #ifdef CONFIG_SYSFS int module_add_modinfo_attrs(struct module *mod) @@ -1231,9 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod) } kfree(mod->modinfo_attrs); } -#endif -#ifdef CONFIG_SYSFS int mod_sysfs_init(struct module *mod) { int err; @@ -1936,8 +1933,15 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + /* + * ndiswrapper is under GPL by itself, but loads proprietary modules. + * Don't use add_taint_module(), as it would prevent ndiswrapper from + * using GPL-only symbols it needs. + */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE); + + /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -2038,7 +2042,7 @@ static struct module *load_module(void __user *umod, #ifdef CONFIG_MARKERS if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, NULL, NULL); + mod->markers + mod->num_markers); #endif err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2174,10 +2178,20 @@ sys_init_module(void __user *umod, wake_up(&module_wq); return ret; } + if (ret > 0) { + printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " + "it should follow 0/-E convention\n" + KERN_WARNING "%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } - /* Now it's a first class citizen! */ - mutex_lock(&module_mutex); + /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; + wake_up(&module_wq); + + mutex_lock(&module_mutex); /* Drop initial reference. 
*/ module_put(mod); unwind_remove_table(mod->unwind_info, 1); @@ -2186,7 +2200,6 @@ sys_init_module(void __user *umod, mod->init_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); - wake_up(&module_wq); return 0; } @@ -2564,7 +2577,7 @@ EXPORT_SYMBOL(struct_module); #endif #ifdef CONFIG_MARKERS -void module_update_markers(struct module *probe_module, int *refcount) +void module_update_markers(void) { struct module *mod; @@ -2572,8 +2585,7 @@ void module_update_markers(struct module *probe_module, int *refcount) list_for_each_entry(mod, &modules, list) if (!mod->taints) marker_update_probe_range(mod->markers, - mod->markers + mod->num_markers, - probe_module, refcount); + mod->markers + mod->num_markers); mutex_unlock(&module_mutex); } #endif diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 022c9c3cee6f..a9b04203a66d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -767,9 +767,11 @@ common_timer_set(struct k_itimer *timr, int flags, /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { /* Setup correct expiry time for relative timers */ - if (mode == HRTIMER_MODE_REL) - timer->expires = ktime_add(timer->expires, - timer->base->get_time()); + if (mode == HRTIMER_MODE_REL) { + timer->expires = + ktime_add_safe(timer->expires, + timer->base->get_time()); + } return 0; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 79833170bb9c..6233f3b4ae66 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -190,7 +190,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/pm.txt> and the + and more information, read <file:Documentation/power/pm.txt> and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. 
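The posix-timers.c hunk above switches the relative SIGEV_NONE case from ktime_add() to ktime_add_safe(), so that a huge relative expiry added to the current base time saturates instead of overflowing into a time in the past. Below is a minimal user-space sketch of that saturating-add idea, assuming illustrative names and a placeholder KTIME_MAX; it is not the kernel's actual ktime_add_safe() implementation.

/*
 * Sketch of the clamped addition used for relative expiry times:
 * only the positive-overflow case matters for a relative timeout,
 * so that is the only case guarded here.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_KTIME_MAX INT64_MAX	/* placeholder for KTIME_MAX */

static int64_t sketch_ktime_add_safe(int64_t base_ns, int64_t delta_ns)
{
        /* Clamp on signed overflow rather than wrapping around. */
        if (delta_ns > 0 && base_ns > SKETCH_KTIME_MAX - delta_ns)
                return SKETCH_KTIME_MAX;
        return base_ns + delta_ns;
}

int main(void)
{
        int64_t now = 1000000000LL;		/* stands in for get_time() */
        int64_t huge_rel = INT64_MAX - 5;	/* absurdly large it_value */

        printf("expires = %lld\n",
               (long long)sketch_ktime_add_safe(now, huge_rel));
        return 0;
}

With the plain addition, the example above would wrap to a negative expiry and the timer would appear long overdue; the clamped version simply pins it at the far future, which is the behaviour the hunk is after.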
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 859a8e59773a..14a656cdc652 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -391,7 +391,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - error = device_suspend(PMSG_SUSPEND); + error = device_suspend(PMSG_HIBERNATE); if (error) goto Resume_console; @@ -404,7 +404,7 @@ int hibernation_platform_enter(void) goto Finish; local_irq_disable(); - error = device_power_down(PMSG_SUSPEND); + error = device_power_down(PMSG_HIBERNATE); if (!error) { hibernation_ops->enter(); /* We should never get here */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 7c2118f9597f..f1d0b345c9ba 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -75,22 +75,15 @@ void refrigerator(void) __set_current_state(save); } -static void fake_signal_wake_up(struct task_struct *p, int resume) +static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, resume); + signal_wake_up(p, 0); spin_unlock_irqrestore(&p->sighand->siglock, flags); } -static void send_fake_signal(struct task_struct *p) -{ - if (task_is_stopped(p)) - force_sig_specific(SIGSTOP, p); - fake_signal_wake_up(p, task_is_stopped(p)); -} - static int has_mm(struct task_struct *p) { return (p->mm && !(p->flags & PF_BORROWED_MM)); @@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) if (freezing(p)) { if (has_mm(p)) { if (!signal_pending(p)) - fake_signal_wake_up(p, 0); + fake_signal_wake_up(p); } else { if (with_mm_only) ret = 0; @@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) } else { if (has_mm(p)) { set_freeze_flag(p); - send_fake_signal(p); + fake_signal_wake_up(p); } else { if (with_mm_only) { ret = 0; @@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space) if (frozen(p) || !freezeable(p)) continue; - if (task_is_traced(p) && frozen(p->parent)) { - cancel_freezing(p); - continue; - } - if (!freeze_task(p, freeze_user_space)) continue; - if (!freezer_should_skip(p)) + /* + * Now that we've done set_freeze_flag, don't + * perturb a task in TASK_STOPPED or TASK_TRACED. + * It is "frozen enough". If the task does wake + * up, it will immediately call try_to_freeze. + */ + if (!task_is_stopped_or_traced(p) && + !freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 95250d7c8d91..5f91a07c4eac 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * of @bm->cur_zone_bm are updated. 
*/ -static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; @@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - BUG_ON(!zone_bm); + if (!zone_bm) + return -EFAULT; } bm->cur.zone_bm = zone_bm; } @@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, pfn -= bb->start_pfn; *bit_nr = pfn % BM_BITS_PER_CHUNK; *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + return 0; } static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); set_bit(bit, addr); } +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; +} + static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); clear_bit(bit, addr); } @@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); return test_bit(bit, addr); } @@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) region->end_pfn << PAGE_SHIFT); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) - memory_bm_set_bit(bm, pfn); + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } @@ -875,8 +902,8 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } #endif /* CONFIG_HIGHMEM */ /** - * saveable - Determine whether a non-highmem page should be included in - * the suspend image. + * saveable_page - Determine whether a non-highmem page should be included + * in the suspend image. * * We should save the page if it isn't Nosave, and is not in the range * of pages statically defined as 'unsaveable', and it isn't a part of @@ -897,7 +924,8 @@ static struct page *saveable_page(unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; - if (PageReserved(page) && pfn_is_nosave(pfn)) + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; return page; @@ -938,6 +966,25 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } + +/** + * safe_copy_page - check if the page we are going to copy is marked as + * present in the kernel page tables (this always is the case if + * CONFIG_DEBUG_PAGEALLOC is not set and in that case + * kernel_page_present() always returns 'true'). 
+ */ +static void safe_copy_page(void *dst, struct page *s_page) +{ + if (kernel_page_present(s_page)) { + do_copy_page(dst, page_address(s_page)); + } else { + kernel_map_pages(s_page, 1, 1); + do_copy_page(dst, page_address(s_page)); + kernel_map_pages(s_page, 1, 0); + } +} + + #ifdef CONFIG_HIGHMEM static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) @@ -946,8 +993,7 @@ page_is_saveable(struct zone *zone, unsigned long pfn) saveable_highmem_page(pfn) : saveable_page(pfn); } -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; @@ -961,29 +1007,26 @@ copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); } else { - src = page_address(s_page); if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - do_copy_page(buffer, src); + safe_copy_page(buffer, s_page); dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { - dst = page_address(d_page); - do_copy_page(dst, src); + safe_copy_page(page_address(d_page), s_page); } } } #else #define page_is_saveable(zone, pfn) saveable_page(pfn) -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - do_copy_page(page_address(pfn_to_page(dst_pfn)), - page_address(pfn_to_page(src_pfn))); + safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ diff --git a/kernel/printk.c b/kernel/printk.c index bee36100f110..bdd4ea8c3f2b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -616,6 +616,53 @@ asmlinkage int printk(const char *fmt, ...) /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_semaphore held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. + */ +static int acquire_console_semaphore_for_printk(unsigned int cpu) +{ + int retval = 0; + + if (!try_acquire_console_sem()) { + retval = 1; + + /* + * If we can't use the console, we need to release + * the console semaphore by hand to avoid flushing + * the buffer. We need to hold the console semaphore + * in order to do this test safely. 
+ */ + if (!can_use_console(cpu)) { + console_locked = 0; + up(&console_sem); + retval = 0; + } + } + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + return retval; +} + const char printk_recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int printk_recursion_bug; @@ -666,7 +713,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf), fmt, args); + sizeof(printk_buf) - printed_len, fmt, args); /* * Copy the output into log_buf. If the caller didn't provide @@ -725,43 +772,22 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { - /* - * We own the drivers. We can drop the spinlock and - * let release_console_sem() print the text, maybe ... - */ - console_locked = 1; - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + /* + * Try to acquire and then immediately release the + * console semaphore. The release will do all the + * actual magic (print out buffers, wake up klogd, + * etc). + * + * The acquire_console_semaphore_for_printk() function + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ + if (acquire_console_semaphore_for_printk(this_cpu)) + release_console_sem(); - /* - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { - console_may_schedule = 0; - release_console_sem(); - } else { - /* Release by hand to avoid flushing the buffer. */ - console_locked = 0; - up(&console_sem); - } - lockdep_on(); - raw_local_irq_restore(flags); - } else { - /* - * Someone else owns the drivers. We drop the spinlock, which - * allows the semaphore holder to proceed and to call the - * console drivers with the output which we just produced. - */ - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); - lockdep_on(); + lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); - } + raw_local_irq_restore(flags); preempt_enable(); return printed_len; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 760dfc233a00..c09605f8d16c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -56,7 +56,10 @@ static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); static struct completion rcu_barrier_completion; -/* Because of FASTCALL declaration of complete, we use this wrapper */ +/* + * Awaken the corresponding synchronize_rcu() instance now that a + * grace period has elapsed. + */ static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 987cfb7ade89..e9517014b57c 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -23,6 +23,10 @@ * to Suparna Bhattacharya for pushing me completely away * from atomic instructions on the read side. * + * - Added handling of Dynamic Ticks + * Copyright 2007 - Paul E. 
Mckenney <paulmck@us.ibm.com> + * - Steven Rostedt <srostedt@redhat.com> + * * Papers: http://www.rdrop.com/users/paulmck/RCU * * Design Document: http://lwn.net/Articles/253651/ @@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } +#ifdef CONFIG_NO_HZ + +DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; +static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +static DEFINE_PER_CPU(int, rcu_update_flag); + +/** + * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. + * + * If the CPU was idle with dynamic ticks active, this updates the + * dynticks_progress_counter to let the RCU handling know that the + * CPU is active. + */ +void rcu_irq_enter(void) +{ + int cpu = smp_processor_id(); + + if (per_cpu(rcu_update_flag, cpu)) + per_cpu(rcu_update_flag, cpu)++; + + /* + * Only update if we are coming from a stopped ticks mode + * (dynticks_progress_counter is even). + */ + if (!in_interrupt() && + (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + /* + * The following might seem like we could have a race + * with NMI/SMIs. But this really isn't a problem. + * Here we do a read/modify/write, and the race happens + * when an NMI/SMI comes in after the read and before + * the write. But NMI/SMIs will increment this counter + * twice before returning, so the zero bit will not + * be corrupted by the NMI/SMI which is the most important + * part. + * + * The only thing is that we would bring back the counter + * to a postion that it was in during the NMI/SMI. + * But the zero bit would be set, so the rest of the + * counter would again be ignored. + * + * On return from the IRQ, the counter may have the zero + * bit be 0 and the counter the same as the return from + * the NMI/SMI. If the state machine was so unlucky to + * see that, it still doesn't matter, since all + * RCU read-side critical sections on this CPU would + * have already completed. + */ + per_cpu(dynticks_progress_counter, cpu)++; + /* + * The following memory barrier ensures that any + * rcu_read_lock() primitives in the irq handler + * are seen by other CPUs to follow the above + * increment to dynticks_progress_counter. This is + * required in order for other CPUs to correctly + * determine when it is safe to advance the RCU + * grace-period state machine. + */ + smp_mb(); /* see above block comment. */ + /* + * Since we can't determine the dynamic tick mode from + * the dynticks_progress_counter after this routine, + * we use a second flag to acknowledge that we came + * from an idle state with ticks stopped. + */ + per_cpu(rcu_update_flag, cpu)++; + /* + * If we take an NMI/SMI now, they will also increment + * the rcu_update_flag, and will not update the + * dynticks_progress_counter on exit. That is for + * this IRQ to do. + */ + } +} + +/** + * rcu_irq_exit - Called from exiting Hard irq context. + * + * If the CPU was idle with dynamic ticks active, update the + * dynticks_progress_counter to put let the RCU handling be + * aware that the CPU is going back to idle with no ticks. + */ +void rcu_irq_exit(void) +{ + int cpu = smp_processor_id(); + + /* + * rcu_update_flag is set if we interrupted the CPU + * when it was idle with ticks stopped. + * Once this occurs, we keep track of interrupt nesting + * because a NMI/SMI could also come in, and we still + * only want the IRQ that started the increment of the + * dynticks_progress_counter to be the one that modifies + * it on exit. 
+ */ + if (per_cpu(rcu_update_flag, cpu)) { + if (--per_cpu(rcu_update_flag, cpu)) + return; + + /* This must match the interrupt nesting */ + WARN_ON(in_interrupt()); + + /* + * If an NMI/SMI happens now we are still + * protected by the dynticks_progress_counter being odd. + */ + + /* + * The following memory barrier ensures that any + * rcu_read_unlock() primitives in the irq handler + * are seen by other CPUs to preceed the following + * increment to dynticks_progress_counter. This + * is required in order for other CPUs to determine + * when it is safe to advance the RCU grace-period + * state machine. + */ + smp_mb(); /* see above block comment. */ + per_cpu(dynticks_progress_counter, cpu)++; + WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + } +} + +static void dyntick_save_progress_counter(int cpu) +{ + per_cpu(rcu_dyntick_snapshot, cpu) = + per_cpu(dynticks_progress_counter, cpu); +} + +static inline int +rcu_try_flip_waitack_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. So we can safely pretend that this CPU + * already acknowledged the counter. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, we can safely pretend + * that this CPU already acknowledged the counter. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to explicitly acknowledge the counter flip. */ + + return 1; +} + +static inline int +rcu_try_flip_waitmb_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot have executed an RCU read-side critical section + * during that time, so there is no need for it to execute a + * memory barrier. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU either entered or exited an outermost interrupt, + * SMI, NMI, or whatever handler, then we know that it executed + * a memory barrier when doing so. So we don't need another one. + */ + if (curr != snap) + return 0; + + /* We need the CPU to execute a memory barrier. */ + + return 1; +} + +#else /* !CONFIG_NO_HZ */ + +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +#endif /* CONFIG_NO_HZ */ + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -447,8 +657,10 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. 
*/ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; + dyntick_save_progress_counter(cpu); + } return 1; } @@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { + if (rcu_try_flip_waitack_needed(cpu) && + per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); return 0; } @@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; + dyntick_save_progress_counter(cpu); + } RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); return 1; @@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { + if (rcu_try_flip_waitmb_needed(cpu) && + per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); return 0; } @@ -702,8 +918,9 @@ void rcu_offline_cpu(int cpu) * fix. */ + local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock_irqsave(&rdp->lock, flags); + spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; @@ -735,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; struct rcu_head *next, *list; - struct rcu_data *rdp = RCU_DATA_ME(); + struct rcu_data *rdp; - spin_lock_irqsave(&rdp->lock, flags); + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); list = rdp->donelist; if (list == NULL) { spin_unlock_irqrestore(&rdp->lock, flags); diff --git a/kernel/relay.c b/kernel/relay.c index d080b9d161a7..d6204a485818 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) kref_get(&buf->kref); filp->private_data = buf; - return 0; + return nonseekable_open(inode, filp); } /** @@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) +{ +} + /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ @@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, + .spd_release = relay_page_release, }; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) @@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; + nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); - for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { + for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; unsigned int cur_pos = read_start + total_len; diff --git a/kernel/res_counter.c 
b/kernel/res_counter.c index 16cbec2d5d60..efbfc0fc232f 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member, ret = -EINVAL; + strstrip(buf); if (write_strategy) { if (write_strategy(buf, &tmp)) { goto out_free; diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 0deef71ff8d2..6522ae5b14a2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -630,9 +630,12 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) + if (unlikely(timeout)) { hrtimer_start(&timeout->timer, timeout->timer.expires, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } for (;;) { /* Try to acquire the lock: */ diff --git a/kernel/sched.c b/kernel/sched.c index 3eedd5260907..8dcdec6fe0fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -155,7 +155,7 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -165,118 +165,88 @@ static LIST_HEAD(task_groups); /* task group related information */ struct task_group { -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; + unsigned long shares; +#endif +#ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - unsigned int rt_ratio; - - /* - * shares assigned to a task group governs how much of cpu bandwidth - * is allocated to the group. The more shares a group has, the more is - * the cpu bandwidth allocated to it. - * - * For ex, lets say that there are three task groups, A, B and C which - * have been assigned shares 1000, 2000 and 3000 respectively. Then, - * cpu bandwidth allocated by the scheduler to task groups A, B and C - * should be: - * - * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% - * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% - * - * The weight assigned to a task group's schedulable entities on every - * cpu (task_group.se[a_cpu]->load.weight) is derived from the task - * group's shares. For ex: lets say that task group A has been - * assigned shares of 1000 and there are two CPUs in a system. Then, - * - * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; - * - * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: - * - * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 - * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 - * - * rebalance_shares() is responsible for distributing the shares of a - * task groups like this among the group's schedulable entities across - * cpus. 
- * - */ - unsigned long shares; + u64 rt_runtime; +#endif struct rcu_head rcu; struct list_head list; }; +#ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif -/* task_group_mutex serializes add/remove of task groups and also changes to +/* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. */ -static DEFINE_MUTEX(task_group_mutex); +static DEFINE_SPINLOCK(task_group_lock); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); -#ifdef CONFIG_SMP -/* kernel thread that runs rebalance_shares() periodically */ -static struct task_struct *lb_monitor_task; -static int load_balance_monitor(void *unused); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_USER_SCHED +# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) +#else +# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -static void set_se_shares(struct sched_entity *se, unsigned long shares); +static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif /* Default task group. * Every task in system belong to this group at bootup. 
*/ struct task_group init_task_group = { +#ifdef CONFIG_FAIR_GROUP_SCHED .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, +#endif +#ifdef CONFIG_RT_GROUP_SCHED .rt_se = init_sched_rt_entity_p, .rt_rq = init_rt_rq_p, -}; - -#ifdef CONFIG_FAIR_USER_SCHED -# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) -#else -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif - -#define MIN_GROUP_SHARES 2 - -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +}; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED tg = p->user->tg; -#elif defined(CONFIG_FAIR_CGROUP_SCHED) +#elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -288,21 +258,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { +#ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; +#endif +#ifdef CONFIG_RT_GROUP_SCHED p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; -} - -static inline void lock_task_group_list(void) -{ - mutex_lock(&task_group_mutex); -} - -static inline void unlock_task_group_list(void) -{ - mutex_unlock(&task_group_mutex); +#endif } static inline void lock_doms_cur(void) @@ -318,12 +282,10 @@ static inline void unlock_doms_cur(void) #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_task_group_list(void) { } -static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } static inline void unlock_doms_cur(void) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -339,7 +301,7 @@ struct cfs_rq { /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
*/ - struct sched_entity *curr; + struct sched_entity *curr, *next; unsigned long nr_spread_over; @@ -363,7 +325,7 @@ struct cfs_rq { struct rt_rq { struct rt_prio_array active; unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED int highest_prio; /* highest queued rt task prio */ #endif #ifdef CONFIG_SMP @@ -373,7 +335,9 @@ struct rt_rq { int rt_throttled; u64 rt_time; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; @@ -447,6 +411,8 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif @@ -628,18 +594,14 @@ enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_WAKEUP_PREEMPT = 2, SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_TREE_AVG = 8, - SCHED_FEAT_APPROX_AVG = 16, - SCHED_FEAT_HRTICK = 32, - SCHED_FEAT_DOUBLE_TICK = 64, + SCHED_FEAT_HRTICK = 8, + SCHED_FEAT_DOUBLE_TICK = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0; @@ -652,19 +614,23 @@ const_debug unsigned int sysctl_sched_features = const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in ms. + * period over which we measure -rt task cpu usage in us. * default: 1s */ -const_debug unsigned int sysctl_sched_rt_period = 1000; +unsigned int sysctl_sched_rt_period = 1000000; -#define SCHED_RT_FRAC_SHIFT 16 -#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) +static __read_mostly int scheduler_running; /* - * ratio of time -rt tasks may consume. - * default: 95% + * part of the period that we allow rt tasks to run in us. + * default: 0.95s */ -const_debug unsigned int sysctl_sched_rt_ratio = 62259; +int sysctl_sched_rt_runtime = 950000; + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -676,14 +642,16 @@ unsigned long long cpu_clock(int cpu) unsigned long flags; struct rq *rq; - local_irq_save(flags); - rq = cpu_rq(cpu); /* * Only call sched_clock() if the scheduler has already been * initialized (some code might call cpu_clock() very early): */ - if (rq->idle) - update_rq_clock(rq); + if (unlikely(!scheduler_running)) + return 0; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); now = rq->clock; local_irq_restore(flags); @@ -1084,6 +1052,49 @@ static void resched_cpu(int cpu) resched_task(cpu_curr(cpu)); spin_unlock_irqrestore(&rq->lock, flags); } + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. 
+ */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif + #else static void __resched_task(struct task_struct *p, int tif_bit) { @@ -1112,7 +1123,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; + lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); tmp = (u64)delta_exec * weight; /* @@ -1136,11 +1147,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; + lw->inv_weight = 0; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; + lw->inv_weight = 0; } /* @@ -1228,16 +1241,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); @@ -1255,14 +1258,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct rq *rq) +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_load_add(&rq->load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_load_sub(&rq->load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; + inc_load(rq, p); } -static void dec_nr_running(struct rq *rq) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1354,7 +1369,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + inc_nr_running(p, rq); } /* @@ -1366,7 +1381,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(rq); + dec_nr_running(p, rq); } /** @@ -1420,6 +1435,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + /* + * Buddy candidates are cache hot: + */ + if (&p->se == cfs_rq_of(&p->se)->next) + return 1; + if (p->sched_class != &fair_sched_class) return 0; @@ -1818,6 +1839,7 @@ static int try_to_wake_up(struct task_struct *p, 
unsigned int state, int sync) long old_state; struct rq *rq; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1878,10 +1900,11 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - check_preempt_curr(rq, p); success = 1; out_running: + check_preempt_curr(rq, p); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -1915,6 +1938,8 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -2005,7 +2030,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(rq); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3753,7 +3778,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void add_preempt_count(int val) +void __kprobes add_preempt_count(int val) { /* * Underflow? @@ -3769,7 +3794,7 @@ void add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void sub_preempt_count(int val) +void __kprobes sub_preempt_count(int val) { /* * Underflow? @@ -3871,7 +3896,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev) asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - long *switch_count; + unsigned long *switch_count; struct rq *rq; int cpu; @@ -3900,7 +3925,7 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { + signal_pending(prev))) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, 1); @@ -4293,11 +4318,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -4306,10 +4330,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -4344,8 +4367,10 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_load(rq, p); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4355,6 +4380,7 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4444,7 +4470,7 @@ int task_nice(const struct task_struct *p) { return TASK_NICE(p); } -EXPORT_SYMBOL_GPL(task_nice); +EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? @@ -4571,6 +4597,15 @@ recheck: return -EPERM; } +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. 
+ */ + if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + return -EPERM; +#endif + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4594,19 +4629,17 @@ recheck: update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -5113,7 +5146,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) time_slice = 0; if (p->policy == SCHED_RR) { time_slice = DEF_TIMESLICE; - } else { + } else if (p->policy != SCHED_FIFO) { struct sched_entity *se = &p->se; unsigned long flags; struct rq *rq; @@ -5894,7 +5927,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; - case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); @@ -6816,6 +6850,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ */ static cpumask_t fallback_doms; +void __attribute__((weak)) arch_update_cpu_topology(void) +{ +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -6825,6 +6863,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) { int err; + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) @@ -6929,7 +6968,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static int arch_reinit_sched_domains(void) +int arch_reinit_sched_domains(void) { int err; @@ -7060,21 +7099,6 @@ void __init sched_init_smp(void) if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (nr_cpu_ids == 1) - return; - - lb_monitor_task = kthread_create(load_balance_monitor, NULL, - "group_balance"); - if (!IS_ERR(lb_monitor_task)) { - lb_monitor_task->flags |= PF_NOFREEZE; - wake_up_process(lb_monitor_task); - } else { - printk(KERN_ERR "Could not create load balance monitor thread" - "(error = %ld) \n", PTR_ERR(lb_monitor_task)); - } -#endif } #else void __init sched_init_smp(void) @@ -7112,7 +7136,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED rt_rq->highest_prio = MAX_RT_PRIO; #endif #ifdef CONFIG_SMP @@ -7123,7 +7147,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; #endif } @@ -7146,7 +7171,9 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); se->parent = NULL; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, int add) @@ -7175,7 +7202,7 @@ void __init 
sched_init(void) init_defrootdomain(); #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7196,7 +7223,10 @@ void __init sched_init(void) &per_cpu(init_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1); - init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_runtime = + sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), @@ -7255,6 +7285,8 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + + scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -7303,7 +7335,7 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irq(&tasklist_lock); + read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { /* * Only normalize user tasks: @@ -7329,16 +7361,16 @@ void normalize_rt_tasks(void) continue; } - spin_lock_irqsave(&p->pi_lock, flags); + spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + spin_unlock(&p->pi_lock); } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + read_unlock_irqrestore(&tasklist_lock, flags); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7387,202 +7419,114 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED -#ifdef CONFIG_SMP -/* - * distribute shares of all task groups among their schedulable entities, - * to reflect load distribution across cpus. - */ -static int rebalance_shares(struct sched_domain *sd, int this_cpu) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) { - struct cfs_rq *cfs_rq; - struct rq *rq = cpu_rq(this_cpu); - cpumask_t sdspan = sd->span; - int balanced = 1; - - /* Walk thr' all the task groups that we have */ - for_each_leaf_cfs_rq(rq, cfs_rq) { - int i; - unsigned long total_load = 0, total_shares; - struct task_group *tg = cfs_rq->tg; - - /* Gather total task load of this group across cpus */ - for_each_cpu_mask(i, sdspan) - total_load += tg->cfs_rq[i]->load.weight; - - /* Nothing to do if this group has no load */ - if (!total_load) - continue; - - /* - * tg->shares represents the number of cpu shares the task group - * is eligible to hold on a single cpu. On N cpus, it is - * eligible to hold (N * tg->shares) number of cpu shares. - */ - total_shares = tg->shares * cpus_weight(sdspan); - - /* - * redistribute total_shares across cpus as per the task load - * distribution. - */ - for_each_cpu_mask(i, sdspan) { - unsigned long local_load, local_shares; - - local_load = tg->cfs_rq[i]->load.weight; - local_shares = (local_load * total_shares) / total_load; - if (!local_shares) - local_shares = MIN_GROUP_SHARES; - if (local_shares == tg->se[i]->load.weight) - continue; + int i; - spin_lock_irq(&cpu_rq(i)->lock); - set_se_shares(tg->se[i], local_shares); - spin_unlock_irq(&cpu_rq(i)->lock); - balanced = 0; - } + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); } - return balanced; + kfree(tg->cfs_rq); + kfree(tg->se); } -/* - * How frequently should we rebalance_shares() across cpus? - * - * The more frequently we rebalance shares, the more accurate is the fairness - * of cpu bandwidth distribution between task groups. 
However higher frequency - * also implies increased scheduling overhead. - * - * sysctl_sched_min_bal_int_shares represents the minimum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * sysctl_sched_max_bal_int_shares represents the maximum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * These settings allows for the appropriate trade-off between accuracy of - * fairness and the associated overhead. - * - */ - -/* default: 8ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; - -/* default: 128ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; - -/* kernel thread that runs rebalance_shares() periodically */ -static int load_balance_monitor(void *unused) +static int alloc_fair_sched_group(struct task_group *tg) { - unsigned int timeout = sysctl_sched_min_bal_int_shares; - struct sched_param schedparm; - int ret; + struct cfs_rq *cfs_rq; + struct sched_entity *se; + struct rq *rq; + int i; - /* - * We don't want this thread's execution to be limited by the shares - * assigned to default group (init_task_group). Hence make it run - * as a SCHED_RR RT task at the lowest priority. - */ - schedparm.sched_priority = 1; - ret = sched_setscheduler(current, SCHED_RR, &schedparm); - if (ret) - printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" - " monitor thread (error = %d) \n", ret); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + if (!tg->se) + goto err; - while (!kthread_should_stop()) { - int i, cpu, balanced = 1; + tg->shares = NICE_0_LOAD; - /* Prevent cpus going down or coming up */ - get_online_cpus(); - /* lockout changes to doms_cur[] array */ - lock_doms_cur(); - /* - * Enter a rcu read-side critical section to safely walk rq->sd - * chain on various cpus and to walk task group list - * (rq->leaf_cfs_rq_list) in rebalance_shares(). - */ - rcu_read_lock(); + for_each_possible_cpu(i) { + rq = cpu_rq(i); - for (i = 0; i < ndoms_cur; i++) { - cpumask_t cpumap = doms_cur[i]; - struct sched_domain *sd = NULL, *sd_prev = NULL; + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!cfs_rq) + goto err; - cpu = first_cpu(cpumap); + se = kmalloc_node(sizeof(struct sched_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!se) + goto err; - /* Find the highest domain at which to balance shares */ - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - sd_prev = sd; - } + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + } - sd = sd_prev; - /* sd == NULL? 
No load balance reqd in this domain */ - if (!sd) - continue; + return 1; - balanced &= rebalance_shares(sd, cpu); - } + err: + return 0; +} - rcu_read_unlock(); +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} - unlock_doms_cur(); - put_online_cpus(); +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else +static inline void free_fair_sched_group(struct task_group *tg) +{ +} - if (!balanced) - timeout = sysctl_sched_min_bal_int_shares; - else if (timeout < sysctl_sched_max_bal_int_shares) - timeout *= 2; +static inline int alloc_fair_sched_group(struct task_group *tg) +{ + return 1; +} - msleep_interruptible(timeout); - } +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} - return 0; +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ } -#endif /* CONFIG_SMP */ +#endif -static void free_sched_group(struct task_group *tg) +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) { int i; for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); if (tg->rt_rq) kfree(tg->rt_rq[i]); if (tg->rt_se) kfree(tg->rt_se[i]); } - kfree(tg->cfs_rq); - kfree(tg->se); kfree(tg->rt_rq); kfree(tg->rt_se); - kfree(tg); } -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +static int alloc_rt_sched_group(struct task_group *tg) { - struct task_group *tg; - struct cfs_rq *cfs_rq; - struct sched_entity *se; struct rt_rq *rt_rq; struct sched_rt_entity *rt_se; struct rq *rq; int i; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); - if (!tg->se) - goto err; tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); if (!tg->rt_rq) goto err; @@ -7590,22 +7534,11 @@ struct task_group *sched_create_group(void) if (!tg->rt_se) goto err; - tg->shares = NICE_0_LOAD; - tg->rt_ratio = 0; /* XXX */ + tg->rt_runtime = 0; for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); - if (!se) - goto err; - rt_rq = kmalloc_node(sizeof(struct rt_rq), GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!rt_rq) @@ -7616,20 +7549,75 @@ struct task_group *sched_create_group(void) if (!rt_se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - lock_task_group_list(); + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_rt_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + 
+static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg)) + goto err; + + if (!alloc_rt_sched_group(tg)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); - unlock_task_group_list(); + spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -7648,21 +7636,16 @@ static void free_sched_group_rcu(struct rcu_head *rhp) /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { - struct cfs_rq *cfs_rq = NULL; - struct rt_rq *rt_rq = NULL; + unsigned long flags; int i; - lock_task_group_list(); + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - rt_rq = tg->rt_rq[i]; - list_del_rcu(&rt_rq->leaf_rt_rq_list); + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); - unlock_task_group_list(); - - BUG_ON(!cfs_rq); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ call_rcu(&tg->rcu, free_sched_group_rcu); @@ -7686,70 +7669,71 @@ void sched_move_task(struct task_struct *tsk) running = task_current(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - } + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); set_task_rq(tsk, task_cpu(tsk)); - if (on_rq) { - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->moved_group) + tsk->sched_class->moved_group(tsk); +#endif + + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) enqueue_task(rq, tsk, 0); - } task_rq_unlock(rq, &flags); } -/* rq->lock to be locked by caller */ +#ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; struct rq *rq = cfs_rq->rq; int on_rq; - if (!shares) - shares = MIN_GROUP_SHARES; + spin_lock_irq(&rq->lock); on_rq = se->on_rq; - if (on_rq) { + if (on_rq) dequeue_entity(cfs_rq, se, 0); - dec_cpu_load(rq, se->load.weight); - } se->load.weight = shares; se->load.inv_weight = div64_64((1ULL<<32), shares); - if (on_rq) { + if (on_rq) enqueue_entity(cfs_rq, se, 0); - inc_cpu_load(rq, se->load.weight); - } + + spin_unlock_irq(&rq->lock); } +static DEFINE_MUTEX(shares_mutex); + int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - struct cfs_rq *cfs_rq; - struct rq *rq; + unsigned long flags; + + /* + * A weight of 0 or 1 can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) 
+ */ + if (shares < 2) + shares = 2; - lock_task_group_list(); + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; - if (shares < MIN_GROUP_SHARES) - shares = MIN_GROUP_SHARES; - - /* - * Prevent any load balance activity (rebalance_shares, - * load_balance_fair) from referring to this group first, - * by taking it off the rq->leaf_cfs_rq_list on each cpu. - */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ synchronize_sched(); @@ -7759,23 +7743,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) { - spin_lock_irq(&cpu_rq(i)->lock); + for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); - spin_unlock_irq(&cpu_rq(i)->lock); - } /* * Enable load balance activity on this group, by inserting it back on * each cpu's rq->leaf_cfs_rq_list. */ - for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); done: - unlock_task_group_list(); + mutex_unlock(&shares_mutex); return 0; } @@ -7783,35 +7763,97 @@ unsigned long sched_group_shares(struct task_group *tg) { return tg->shares; } +#endif +#ifdef CONFIG_RT_GROUP_SCHED /* - * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + * Ensure that the real time constraints are schedulable. */ -int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 16; + + return div64_64(runtime << 16, period); +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; + unsigned long global_ratio = + to_ratio(sysctl_sched_rt_period, + sysctl_sched_rt_runtime < 0 ? 
+ RUNTIME_INF : sysctl_sched_rt_runtime); rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) - total += tgi->rt_ratio; + list_for_each_entry_rcu(tgi, &task_groups, list) { + if (tgi == tg) + continue; + + total += to_ratio(period, tgi->rt_runtime); + } rcu_read_unlock(); - if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) - return -EINVAL; + return total + to_ratio(period, runtime) < global_ratio; +} - tg->rt_ratio = rt_ratio; +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); return 0; } -unsigned long sched_group_rt_ratio(struct task_group *tg) +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { - return tg->rt_ratio; + u64 rt_runtime, rt_period; + int err = 0; + + rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us == -1) + rt_runtime = RUNTIME_INF; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { + err = -EBUSY; + goto unlock; + } + if (!__rt_schedulable(tg, rt_period, rt_runtime)) { + err = -EINVAL; + goto unlock; + } + tg->rt_runtime = rt_runtime; + unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} +#endif +#endif /* CONFIG_GROUP_SCHED */ -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) @@ -7857,9 +7899,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { +#ifdef CONFIG_RT_GROUP_SCHED + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + return -EINVAL; +#else /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) return -EINVAL; +#endif return 0; } @@ -7871,6 +7919,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, sched_move_task(tsk); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { @@ -7883,31 +7932,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +#endif -static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_ratio_val) +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { - return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); + char buffer[64]; + int retval = 0; + s64 val; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + + /* strip newline if necessary */ + if (nbytes && (buffer[nbytes-1] == '\n')) + buffer[nbytes-1] = 0; + val = simple_strtoll(buffer, &end, 0); + if (*end) + 
return -EINVAL; + + /* Pass to subsystem */ + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + if (!retval) + retval = nbytes; + return retval; } -static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct task_group *tg = cgroup_tg(cgrp); + char tmp[64]; + long val = sched_group_rt_runtime(cgroup_tg(cgrp)); + int len = sprintf(tmp, "%ld\n", val); - return (u64) tg->rt_ratio; + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } +#endif static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED { - .name = "rt_ratio", - .read_uint = cpu_rt_ratio_read_uint, - .write_uint = cpu_rt_ratio_write_uint, + .name = "rt_runtime_us", + .read = cpu_rt_runtime_read, + .write = cpu_rt_runtime_write, }, +#endif }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) @@ -7926,7 +8014,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .early_init = 1, }; -#endif /* CONFIG_FAIR_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_CGROUP_CPUACCT diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4b5e24cf2f4a..ef358ba07683 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); + PN(se.avg_overlap); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c091d6e159d..0080968d3e4a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) + if (leftmost) { cfs_rq->rb_leftmost = &se->run_node; + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. 
+ */ + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, se->vruntime); + } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + struct sched_entity *next; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + + if (next_node) { + next = rb_entry(next_node, + struct sched_entity, run_node); + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, + next->vruntime); + } + } + + if (cfs_rq->next == se) + cfs_rq->next = NULL; rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -202,17 +225,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *se = NULL; - struct rb_node *parent; + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - while (*link) { - parent = *link; - se = rb_entry(parent, struct sched_entity, run_node); - link = &parent->rb_right; - } + if (!last) + return NULL; - return se; + return rb_entry(last, struct sched_entity, run_node); } /************************************************************** @@ -265,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - - return slice; + return calc_delta_mine(__sched_period(cfs_rq->nr_running), + se->load.weight, &cfs_rq->load); } /* @@ -288,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) return vslice; } -static u64 sched_vslice(struct cfs_rq *cfs_rq) -{ - return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); -} - static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { return __sched_vslice(cfs_rq->load.weight + se->load.weight, @@ -308,7 +317,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -320,19 +328,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; - - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. 
- */ - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(curr->vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = curr->vruntime; - - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -498,16 +493,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime; - vruntime = cfs_rq->min_vruntime; - - if (sched_feat(TREE_AVG)) { - struct sched_entity *last = __pick_last_entity(cfs_rq); - if (last) { - vruntime += last->vruntime; - vruntime >>= 1; - } - } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += sched_vslice(cfs_rq)/2; + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(cfs_rq->min_vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, @@ -550,6 +540,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) account_entity_enqueue(cfs_rq, se); } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!se->last_wakeup) + return; + + update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); + se->last_wakeup = 0; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -560,6 +565,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -621,12 +627,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static struct sched_entity * +pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 diff, gran; + + if (!cfs_rq->next) + return se; + + diff = cfs_rq->next->vruntime - se->vruntime; + if (diff < 0) + return se; + + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); + if (diff > gran) + return se; + + return cfs_rq->next; +} + static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = NULL; if (first_fair(cfs_rq)) { se = __pick_next_entity(cfs_rq); + se = pick_next(cfs_rq, se); set_next_entity(cfs_rq, se); } @@ -732,8 +758,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -#define GROUP_IMBALANCE_PCT 20 - #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -824,26 +848,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, - *topse = NULL; /* Highest schedulable entity */ - int incload = 1; + struct sched_entity *se = &p->se; for_each_sched_entity(se) { - topse = se; - if (se->on_rq) { - incload = 0; + if (se->on_rq) break; - } cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } - /* Increment cpu load if we just enqueued the first task of a group on - * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs - * at the highest grouping level. 
- */ - if (incload) - inc_cpu_load(rq, topse->load.weight); hrtick_start_fair(rq, rq->curr); } @@ -856,28 +869,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, - *topse = NULL; /* Highest schedulable entity */ - int decload = 1; + struct sched_entity *se = &p->se; for_each_sched_entity(se) { - topse = se; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) { - if (parent_entity(se)) - decload = 0; + if (cfs_rq->load.weight) break; - } sleep = 1; } - /* Decrement cpu load if we just dequeued the last task of a group on - * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs - * at the highest grouping level. - */ - if (decload) - dec_cpu_load(rq, topse->load.weight); hrtick_start_fair(rq, rq->curr); } @@ -979,96 +980,121 @@ static inline int wake_idle(int cpu, struct task_struct *p) #endif #ifdef CONFIG_SMP -static int select_task_rq_fair(struct task_struct *p, int sync) + +static const struct sched_class fair_sched_class; + +static int +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, + struct task_struct *p, int prev_cpu, int this_cpu, int sync, + int idx, unsigned long load, unsigned long this_load, + unsigned int imbalance) { - int cpu, this_cpu; - struct rq *rq; - struct sched_domain *sd, *this_sd = NULL; - int new_cpu; + struct task_struct *curr = this_rq->curr; + unsigned long tl = this_load; + unsigned long tl_per_task; + + if (!(this_sd->flags & SD_WAKE_AFFINE)) + return 0; + + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && curr->sched_class == &fair_sched_class) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } - cpu = task_cpu(p); - rq = task_rq(p); - this_cpu = smp_processor_id(); - new_cpu = cpu; + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); - if (cpu == this_cpu) - goto out_set_cpu; + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. 
+ */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + return 1; + } + return 0; +} + +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + struct sched_domain *sd, *this_sd = NULL; + int prev_cpu, this_cpu, new_cpu; + unsigned long load, this_load; + struct rq *rq, *this_rq; + unsigned int imbalance; + int idx; + + prev_cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + this_rq = cpu_rq(this_cpu); + new_cpu = prev_cpu; + + /* + * 'this_sd' is the first domain that both + * this_cpu and prev_cpu are present in: + */ for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpu_isset(prev_cpu, sd->span)) { this_sd = sd; break; } } if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; + goto out; /* * Check for affine wakeup and passive balancing possibilities. */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - unsigned long load, this_load; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } + if (!this_sd) + goto out; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } + idx = this_sd->wake_idx; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + return this_cpu; + + if (prev_cpu == this_cpu) + goto out; + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + return this_cpu; } } - new_cpu = cpu; /* Could not wake to this_cpu. 
Wake to cpu instead */ -out_set_cpu: +out: return wake_idle(new_cpu, p); } #endif /* CONFIG_SMP */ @@ -1090,6 +1116,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + + se->last_wakeup = se->sum_exec_runtime; + if (unlikely(se == pse)) + return; + + cfs_rq_of(pse)->next = pse; + /* * Batch tasks do not preempt (their preemption is driven by * the tick): @@ -1191,6 +1224,25 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr; + struct task_struct *p; + + if (!cfs_rq->nr_running || !first_fair(cfs_rq)) + return MAX_PRIO; + + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + + p = task_of(curr); + + return p->prio; +} +#endif + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -1200,45 +1252,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; - unsigned long load_moved; cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; - unsigned long maxload, task_load, group_weight; - unsigned long thisload, per_task_load; - struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; - - task_load = busy_cfs_rq->load.weight; - group_weight = se->load.weight; + struct cfs_rq *this_cfs_rq; + long imbalance; + unsigned long maxload; - /* - * 'group_weight' is contributed by tasks of total weight - * 'task_load'. To move 'rem_load_move' worth of weight only, - * we need to move a maximum task load of: - * - * maxload = (remload / group_weight) * task_load; - */ - maxload = (rem_load_move * task_load) / group_weight; + this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - if (!maxload || !task_load) + imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; + /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ + if (imbalance <= 0) continue; - per_task_load = task_load / busy_cfs_rq->nr_running; - /* - * balance_tasks will try to forcibly move atleast one task if - * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if - * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. - */ - if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) - continue; + /* Don't pull more than imbalance/2 */ + imbalance /= 2; + maxload = min(rem_load_move, imbalance); - /* Disable priority-based load balance */ - *this_best_prio = 0; - thisload = this_cfs_rq->load.weight; + *this_best_prio = cfs_rq_best_prio(this_cfs_rq); #else # define maxload rem_load_move #endif @@ -1247,33 +1282,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; - load_moved = balance_tasks(this_rq, this_cpu, busiest, + rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, maxload, sd, idle, all_pinned, this_best_prio, &cfs_rq_iterator); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * load_moved holds the task load that was moved. 
The - * effective (group) weight moved would be: - * load_moved_eff = load_moved/task_load * group_weight; - */ - load_moved = (group_weight * load_moved) / task_load; - - /* Adjust shares on both cpus to reflect load_moved */ - group_weight -= load_moved; - set_se_shares(se, group_weight); - - se = busy_cfs_rq->tg->se[this_cpu]; - if (!thisload) - group_weight = load_moved; - else - group_weight = se->load.weight + load_moved; - set_se_shares(se, group_weight); -#endif - - rem_load_move -= load_moved; - if (rem_load_move <= 0) break; } @@ -1403,6 +1416,16 @@ static void set_curr_task_fair(struct rq *rq) set_next_entity(cfs_rq_of(se), se); } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void moved_group_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + update_curr(cfs_rq); + place_entity(cfs_rq, &p->se, 1); +} +#endif + /* * All the scheduling class methods: */ @@ -1431,6 +1454,10 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED + .moved_group = moved_group_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 274b40d7bef2..0a6d2e516420 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -55,14 +55,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return !list_empty(&rt_se->run_list); } -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { if (!rt_rq->tg) - return SCHED_RT_FRAC; + return RUNTIME_INF; - return rt_rq->tg->rt_ratio; + return rt_rq->tg->rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static void enqueue_rt_entity(struct sched_rt_entity *rt_se); static void dequeue_rt_entity(struct sched_rt_entity *rt_se); -static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) } } -static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { struct sched_rt_entity *rt_se = rt_rq->rt_se; @@ -110,11 +110,31 @@ static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq->rt_nr_boosted; + + p = rt_task_of(rt_se); + return p->prio != p->normal_prio; +} + #else -static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - return sysctl_sched_rt_ratio; + if (sysctl_sched_rt_runtime == -1) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -141,19 +161,23 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return NULL; } -static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { } -static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } +static inline int 
rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq) @@ -163,28 +187,26 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } -static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { - unsigned int rt_ratio = sched_rt_ratio(rt_rq); - u64 period, ratio; + u64 runtime = sched_rt_runtime(rt_rq); - if (rt_ratio == SCHED_RT_FRAC) + if (runtime == RUNTIME_INF) return 0; if (rt_rq->rt_throttled) - return 1; - - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; - ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + return rt_rq_throttled(rt_rq); - if (rt_rq->rt_time > ratio) { + if (rt_rq->rt_time > runtime) { struct rq *rq = rq_of_rt_rq(rt_rq); rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - sched_rt_ratio_dequeue(rt_rq); - return 1; + if (rt_rq_throttled(rt_rq)) { + sched_rt_rq_dequeue(rt_rq); + return 1; + } } return 0; @@ -196,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq) u64 period; while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; + period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; rq->rt_period_expire += period; for_each_leaf_rt_rq(rt_rq, rq) { - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; + u64 runtime = sched_rt_runtime(rt_rq); - rt_rq->rt_time -= min(rt_rq->rt_time, ratio); - if (rt_rq->rt_throttled) { + rt_rq->rt_time -= min(rt_rq->rt_time, runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); + sched_rt_rq_enqueue(rt_rq); } } @@ -239,12 +260,7 @@ static void update_curr_rt(struct rq *rq) cpuacct_charge(curr, delta_exec); rt_rq->rt_time += delta_exec; - /* - * might make it a tad more accurate: - * - * update_sched_rt_period(rq); - */ - if (sched_rt_ratio_exceeded(rt_rq)) + if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); } @@ -253,7 +269,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { WARN_ON(!rt_prio(rt_se_prio(rt_se))); rt_rq->rt_nr_running++; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_se_prio(rt_se) < rt_rq->highest_prio) rt_rq->highest_prio = rt_se_prio(rt_se); #endif @@ -265,6 +281,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif +#ifdef CONFIG_RT_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; +#endif } static inline @@ -273,7 +293,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running--; -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED if (rt_rq->rt_nr_running) { struct rt_prio_array *array; @@ -295,6 +315,12 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ +#ifdef CONFIG_RT_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +#endif } static void enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -303,7 +329,7 @@ static void 
enqueue_rt_entity(struct sched_rt_entity *rt_se) struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && group_rq->rt_throttled) + if (group_rq && rt_rq_throttled(group_rq)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -367,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); - - inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -388,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (rt_rq && rt_rq->rt_nr_running) enqueue_rt_entity(rt_se); } - - dec_cpu_load(rq, p->se.load.weight); } /* @@ -496,7 +518,7 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (unlikely(!rt_rq->rt_nr_running)) return NULL; - if (sched_rt_ratio_exceeded(rt_rq)) + if (rt_rq_throttled(rt_rq)) return NULL; do { @@ -1085,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, pull_rt_task(rq); /* * If there's a higher priority task waiting to run - * then reschedule. + * then reschedule. Note, the above pull_rt_task + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. */ - if (p->prio > rq->rt.highest_prio) + if (p->prio > rq->rt.highest_prio && rq->curr == p) resched_task(p); #else /* For UP simply resched on drop of prio */ diff --git a/kernel/semaphore.c b/kernel/semaphore.c new file mode 100644 index 000000000000..5c2942e768cd --- /dev/null +++ b/kernel/semaphore.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * Distributed under the terms of the GNU GPL, version 2 + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. + * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. 
+ * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +void down(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + __down(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ +int down_interruptible(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_interruptible(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ +int down_killable(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_killable(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_killable); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the mutex has + * been acquired successfully or 1 if it it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. 
+ */ +int down_timeout(struct semaphore *sem, long jiffies) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_timeout(sem, jiffies); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +void up(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(list_empty(&sem->wait_list))) + sem->count++; + else + __up(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + int up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. + */ +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + struct task_struct *task = current; + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = task; + waiter.up = 0; + + for (;;) { + if (state == TASK_INTERRUPTIBLE && signal_pending(task)) + goto interrupted; + if (state == TASK_KILLABLE && fatal_signal_pending(task)) + goto interrupted; + if (timeout <= 0) + goto timed_out; + __set_task_state(task, state); + spin_unlock_irq(&sem->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&sem->lock); + if (waiter.up) + return 0; + } + + timed_out: + list_del(&waiter.list); + return -ETIME; + + interrupted: + list_del(&waiter.list); + return -EINTR; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_killable(struct semaphore *sem) +{ + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = 1; + wake_up_process(waiter->task); +} diff --git a/kernel/signal.c b/kernel/signal.c index 2c1f08defac2..cc8303cd093d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -972,7 +972,7 @@ void zap_other_threads(struct task_struct *p) } } -int fastcall __fatal_signal_pending(struct task_struct *tsk) +int __fatal_signal_pending(struct task_struct *tsk) { return sigismember(&tsk->pending.signal, SIGKILL); } @@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) /* Let the debugger run. 
*/ __set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); - try_to_freeze(); read_lock(&tasklist_lock); if (!unlikely(killed) && may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); @@ -1641,6 +1640,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) } /* + * While in TASK_TRACED, we were considered "frozen enough". + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ + try_to_freeze(); + + /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. @@ -1751,15 +1757,60 @@ static int do_signal_stop(int signr) return 1; } +static int ptrace_signal(int signr, siginfo_t *info, + struct pt_regs *regs, void *cookie) +{ + if (!(current->ptrace & PT_PTRACED)) + return signr; + + ptrace_signal_deliver(regs, cookie); + + /* Let the debugger run. */ + ptrace_stop(signr, 0, info); + + /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; + if (signr == 0) + return signr; + + current->exit_code = 0; + + /* Update the siginfo structure if the signal has + changed. If the debugger wanted something + specific in the siginfo structure then it should + have updated *info via PTRACE_SETSIGINFO. */ + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + specific_send_sig_info(signr, info, current); + signr = 0; + } + + return signr; +} + int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie) { sigset_t *mask = ¤t->blocked; int signr = 0; +relock: + /* + * We'll jump back here after any time we were stopped in TASK_STOPPED. + * While in TASK_STOPPED, we were considered "frozen enough". + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ try_to_freeze(); -relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { struct k_sigaction *ka; @@ -1773,36 +1824,10 @@ relock: if (!signr) break; /* will return 0 */ - if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ - ptrace_stop(signr, 0, info); - - /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; - if (signr == 0) - continue; - - current->exit_code = 0; - - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ - if (signr != info->si_signo) { - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; - info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; - } - - /* If the (new) signal is now blocked, requeue it. 
*/ - if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, regs, cookie); + if (!signr) continue; - } } ka = ¤t->sighand->action[signr-1]; diff --git a/kernel/softirq.c b/kernel/softirq.c index 5b3aea5f471e..31e9f2a47928 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -313,6 +313,7 @@ void irq_exit(void) /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) tick_nohz_stop_sched_tick(); + rcu_irq_exit(); #endif preempt_enable_no_resched(); } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7c2da88db4ed..01b6522fd92b 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu) /* initialize timestamp */ touch_softlockup_watchdog(); + set_current_state(TASK_INTERRUPTIBLE); /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 60 seconds then the * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) break; - if (this_cpu != check_cpu) - continue; - - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); + if (this_cpu == check_cpu) { + if (sysctl_hung_task_timeout_secs) + check_hung_uninterruptible_tasks(this_cpu); + } + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d41ef6b4cf72..b2a2d6889bab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -311,9 +311,10 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_ms", + .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, @@ -321,31 +322,12 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_ratio", - .data = &sysctl_sched_rt_ratio, - .maxlen = sizeof(unsigned int), + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_bal_int_shares", - .data = &sysctl_sched_min_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_max_bal_int_shares", - .data = &sysctl_sched_max_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#endif { .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield", @@ -978,8 +960,8 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &nr_overcommit_huge_pages, - .maxlen = sizeof(nr_overcommit_huge_pages), + .data = &sysctl_overcommit_huge_pages, + .maxlen = sizeof(sysctl_overcommit_huge_pages), .mode = 0644, .proc_handler = &hugetlb_overcommit_handler, }, diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776b..f61402b1f2d0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data) } if 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d41ef6b4cf72..b2a2d6889bab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c
@@ -311,9 +311,10 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_period_ms", + .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644,
@@ -321,31 +322,12 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_rt_ratio", - .data = &sysctl_sched_rt_ratio, - .maxlen = sizeof(unsigned int), + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_bal_int_shares", - .data = &sysctl_sched_min_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_max_bal_int_shares", - .data = &sysctl_sched_max_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif -#endif { .ctl_name = CTL_UNNUMBERED, .procname = "sched_compat_yield",
@@ -978,8 +960,8 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nr_overcommit_hugepages", - .data = &nr_overcommit_huge_pages, - .maxlen = sizeof(nr_overcommit_huge_pages), + .data = &sysctl_overcommit_huge_pages, + .maxlen = sizeof(sysctl_overcommit_huge_pages), .mode = 0644, .proc_handler = &hugetlb_overcommit_handler, },
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776b..f61402b1f2d0 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c
@@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - /* Cycle through CPUs to check if the CPUs stay synchronized to - * each other. */ - int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); - if (next_cpu >= NR_CPUS) - next_cpu = first_cpu(cpu_online_map); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); } spin_unlock(&watchdog_lock); }
@@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -179,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (watchdog) del_timer(&watchdog_timer); watchdog = cs; - init_timer_deferrable(&watchdog_timer); + init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */
@@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } }
@@ -228,6 +222,18 @@ void clocksource_resume(void) } /** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. + * + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + +/** * clocksource_get_next - Returns the selected clocksource * */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c88b5910e7ab..5fd9b9469770 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_freq; /* frequency offset (scaled ppm)*/ static long time_reftime; /* time at last adjustment (s) */ long time_adjust; +static long ntp_tick_adj; static void ntp_update_frequency(void) { u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc) freq_adj = shift_right(freq_adj, time_constant * 2 + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + u64 utemp64; temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); if (time_offset < 0) { - temp64 = -temp64; - do_div(temp64, mtemp); - freq_adj -= temp64; + utemp64 = -temp64; + do_div(utemp64, mtemp); + freq_adj -= utemp64; } else { - do_div(temp64, mtemp); - freq_adj += temp64; + utemp64 = temp64; + do_div(utemp64, mtemp); + freq_adj += utemp64; } } freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) notify_cmos_timer(); return(result); } + +static int __init ntp_tick_adj_setup(char *str) +{ + ntp_tick_adj = simple_strtol(str, NULL, 0); + return 1; +} + +__setup("ntp_tick_adj=", ntp_tick_adj_setup);
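The do_adjtimex() change above is needed because do_div() divides an unsigned 64-bit value in place; handing it a negative s64 yields garbage, so the code copies the magnitude into an unsigned temporary and re-applies the sign afterwards. The same idiom as a small stand-alone helper (an illustrative sketch, not code from this patch):

#include <linux/types.h>
#include <asm/div64.h>

/* Divide a possibly negative 64-bit value by a 32-bit divisor. */
static inline s64 signed_div32(s64 value, u32 divisor)
{
	u64 tmp;

	if (value < 0) {
		tmp = -value;			/* work on the magnitude */
		do_div(tmp, divisor);		/* do_div() modifies tmp in place */
		return -(s64)tmp;
	}
	tmp = value;
	do_div(tmp, divisor);
	return tmp;
}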
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e1bd50cbbf5d..fdfa0c745bb6 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c
@@ -14,7 +14,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h>
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1bea399a9ef0..4f3886562b8c 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c
@@ -14,12 +14,14 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <asm/irq_regs.h> + #include "tick-internal.h" /*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0258d3115d54..450c04935b66 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c
@@ -14,7 +14,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa9bb73dbdb4..686da821d376 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void) ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } /*
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void) return; } + rcu_exit_nohz(); + /* Update jiffies first */ select_nohz_load_balancer(0); now = ktime_get();
@@ -637,7 +640,7 @@ void tick_cancel_sched_timer(int cpu) if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); - ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; } #endif /* HIGH_RES_TIMERS */
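The rcu_enter_nohz()/rcu_exit_nohz() calls added above let (preemptible) RCU treat a tickless CPU as being in an extended quiescent state. The rule they encode is simple: the two calls must bracket the whole window in which the tick is stopped, and no RCU read-side critical section may span that window. A rough sketch of the invariant (illustrative only; in the kernel these calls are made by the tick code shown above, not by drivers):

#include <linux/rcupdate.h>

static void nohz_window_example(void)
{
	rcu_read_lock();
	/* any read-side access must be finished before the tick stops */
	rcu_read_unlock();

	rcu_enter_nohz();	/* CPU no longer reports quiescent states */
	/* idle: no rcu_read_lock()/rcu_dereference() allowed in here */
	rcu_exit_nohz();	/* CPU takes part in grace periods again */
}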
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1af9fb050fe2..a3fa587c350c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c
@@ -187,13 +187,16 @@ static void change_clocksource(void) clock->error = 0; clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, - (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); tick_clock_notify(); + /* + * We're holding xtime lock and waking up klogd would deadlock + * us on enqueue. So no printing! printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); + */ } #else static inline void change_clocksource(void) { }
@@ -245,8 +248,7 @@ void __init timekeeping_init(void) ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, - (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); xtime.tv_sec = sec;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index d3d94c1a0fd2..67fe8fc21fb1 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c
@@ -65,9 +65,9 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), - (unsigned long long)(ktime_to_ns(timer->expires) - now)); + (long long)(ktime_to_ns(timer->expires) - now)); } static void
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl index 62b1287932ed..41468035473c 100644 --- a/kernel/timeconst.pl +++ b/kernel/timeconst.pl
@@ -339,7 +339,7 @@ sub output($@) print "\n"; foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', - 'USEC_TO_HZ','HZ_TO_USEC') { + 'HZ_TO_USEC','USEC_TO_HZ') { foreach $bit (32, 64) { foreach $suf ('MUL', 'ADJ', 'SHR') { printf "#define %-23s %s\n",
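The timer_list format fix above matters because the "in N nsecs" delta is negative once a timer has already expired; pushing it through an unsigned conversion turns a small negative number into a 20-digit one. A tiny userspace illustration of the difference:

#include <stdio.h>

int main(void)
{
	long long expires = 1000, now = 1500;
	long long delta = expires - now;	/* -500: timer already expired */

	printf("wrong: in %llu nsecs\n", (unsigned long long)delta);
	printf("right: in %lld nsecs\n", delta);
	return 0;
}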
diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88b..b024106daa70 100644 --- a/kernel/timer.c +++ b/kernel/timer.c
@@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified
diff --git a/kernel/uid16.c b/kernel/uid16.c index dd308ba4e03b..3e41c1673e2f 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c
@@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi { long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; }
@@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g { long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; }
@@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) { long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, fd, user, group); return ret; }
@@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) { long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, rgid, egid); return ret; }
@@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) { long ret = sys_setgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; }
@@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) { long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, ruid, euid); return ret; }
@@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) { long ret = sys_setuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; }
@@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, ruid, euid, suid); return ret; }
@@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, rgid, egid, sgid); return ret; }
@@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) { long ret = sys_setfsuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; }
@@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid) { long ret = sys_setfsgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; }
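prevent_tail_call() is replaced throughout uid16.c by asmlinkage_protect(), which names the argument count and the arguments themselves so the compiler must keep the on-stack copies live until the wrapper returns. A hypothetical wrapper in the same shape (sys_example() and sys_example16() are invented names used only for illustration):

#include <linux/highuid.h>
#include <linux/linkage.h>
#include <linux/types.h>

asmlinkage long sys_example(uid_t user, gid_t group);	/* assumed to exist */

asmlinkage long sys_example16(old_uid_t user, old_gid_t group)
{
	long ret = sys_example(low2highuid(user), low2highgid(group));

	/* avoid REGPARM breakage on x86: */
	asmlinkage_protect(2, ret, user, group);
	return ret;
}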
diff --git a/kernel/user.c b/kernel/user.c index 7d7900c5a1fd..7132022a040c 100644 --- a/kernel/user.c +++ b/kernel/user.c
@@ -57,7 +57,7 @@ struct user_struct root_user = { .uid_keyring = &root_user_keyring, .session_keyring = &root_session_keyring, #endif -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif };
@@ -90,7 +90,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) return NULL; } -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED static void sched_destroy_user(struct user_struct *up) {
@@ -113,15 +113,15 @@ static void sched_switch_user(struct task_struct *p) sched_move_task(p); } -#else /* CONFIG_FAIR_USER_SCHED */ +#else /* CONFIG_USER_SCHED */ static void sched_destroy_user(struct user_struct *up) { } static int sched_create_user(struct user_struct *up) { return 0; } static void sched_switch_user(struct task_struct *p) { } -#endif /* CONFIG_FAIR_USER_SCHED */ +#endif /* CONFIG_USER_SCHED */ -#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) +#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ static DEFINE_MUTEX(uids_mutex);
@@ -137,6 +137,7 @@ static inline void uids_mutex_unlock(void) } /* uid directory attributes */ +#ifdef CONFIG_FAIR_GROUP_SCHED static ssize_t cpu_shares_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
@@ -163,10 +164,45 @@ static ssize_t cpu_shares_store(struct kobject *kobj, static struct kobj_attribute cpu_share_attr = __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static ssize_t cpu_rt_runtime_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); +} + +static ssize_t cpu_rt_runtime_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_runtime; + int rc; + + sscanf(buf, "%lu", &rt_runtime); + + rc = sched_group_set_rt_runtime(up->tg, rt_runtime); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_runtime_attr = + __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); +#endif /* default attributes per uid directory */ static struct attribute *uids_attributes[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED &cpu_share_attr.attr, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + &cpu_rt_runtime_attr.attr, +#endif NULL };
@@ -269,7 +305,7 @@ static inline void free_user(struct user_struct *up, unsigned long flags) schedule_work(&up->work); } -#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ +#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ int uids_sysfs_init(void) { return 0; } static inline int uids_user_create(struct user_struct *up) { return 0; }
@@ -373,7 +409,7 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - /* This case is not possible when CONFIG_FAIR_USER_SCHED + /* This case is not possible when CONFIG_USER_SCHED * is defined, since we serialize alloc_uid() using * uids_mutex. Hence no need to call * sched_destroy_user() or remove_user_sysfs_dir(). |
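With CONFIG_RT_GROUP_SCHED the per-uid sysfs directory gains a cpu_rt_runtime file alongside cpu_share. A userspace sketch of how the new attribute would be read (illustrative; it assumes CONFIG_USER_SCHED, CONFIG_RT_GROUP_SCHED and sysfs mounted at /sys, so the path /sys/kernel/uids/0/cpu_rt_runtime exists):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/uids/0/cpu_rt_runtime";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("uid 0 rt runtime: %s", buf);
	fclose(f);

	/* writing works the same way, e.g.:
	 *   echo 950000 > /sys/kernel/uids/0/cpu_rt_runtime
	 */
	return 0;
}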