[PATCH] RCU signal handling
RCU tasklist_lock and RCU signal handling: send signals RCU-read-locked
instead of tasklist_lock read-locked. This is a scalability improvement on
SMP and a preemption-latency improvement under PREEMPT_RCU.
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: William Irwin <wli@holomorphy.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/kernel/exit.c b/kernel/exit.c
index ee51568..c73a7eb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,7 +72,6 @@
__ptrace_unlink(p);
BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
__exit_signal(p);
- __exit_sighand(p);
/*
* Note that the fastpath in sys_times depends on __exit_signal having
* updated the counters before a task is removed from the tasklist of
diff --git a/kernel/fork.c b/kernel/fork.c
index fb8572a..7fe3adf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -743,6 +743,14 @@
EXPORT_SYMBOL(unshare_files);
+void sighand_free_cb(struct rcu_head *rhp)
+{
+ struct sighand_struct *sp;
+
+ sp = container_of(rhp, struct sighand_struct, rcu);
+ kmem_cache_free(sighand_cachep, sp);
+}
+
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
struct sighand_struct *sig;
@@ -752,7 +760,7 @@
return 0;
}
sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
- tsk->sighand = sig;
+ rcu_assign_pointer(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
spin_lock_init(&sig->siglock);
diff --git a/kernel/pid.c b/kernel/pid.c
index edba31c..1acc072 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -136,7 +136,7 @@
struct hlist_node *elem;
struct pid *pid;
- hlist_for_each_entry(pid, elem,
+ hlist_for_each_entry_rcu(pid, elem,
&pid_hash[type][pid_hashfn(nr)], pid_chain) {
if (pid->nr == nr)
return pid;
@@ -150,15 +150,15 @@
task_pid = &task->pids[type];
pid = find_pid(type, nr);
+ task_pid->nr = nr;
if (pid == NULL) {
- hlist_add_head(&task_pid->pid_chain,
- &pid_hash[type][pid_hashfn(nr)]);
INIT_LIST_HEAD(&task_pid->pid_list);
+ hlist_add_head_rcu(&task_pid->pid_chain,
+ &pid_hash[type][pid_hashfn(nr)]);
} else {
INIT_HLIST_NODE(&task_pid->pid_chain);
- list_add_tail(&task_pid->pid_list, &pid->pid_list);
+ list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
}
- task_pid->nr = nr;
return 0;
}
@@ -170,20 +170,20 @@
pid = &task->pids[type];
if (!hlist_unhashed(&pid->pid_chain)) {
- hlist_del(&pid->pid_chain);
- if (list_empty(&pid->pid_list))
+ if (list_empty(&pid->pid_list)) {
nr = pid->nr;
- else {
+ hlist_del_rcu(&pid->pid_chain);
+ } else {
pid_next = list_entry(pid->pid_list.next,
struct pid, pid_list);
/* insert next pid from pid_list to hash */
- hlist_add_head(&pid_next->pid_chain,
- &pid_hash[type][pid_hashfn(pid_next->nr)]);
+ hlist_replace_rcu(&pid->pid_chain,
+ &pid_next->pid_chain);
}
}
- list_del(&pid->pid_list);
+ list_del_rcu(&pid->pid_list);
pid->nr = 0;
return nr;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c9afc61..0a669bd 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -35,6 +35,7 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
+#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f46c94..9273309 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -176,6 +176,13 @@
#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
< (long long) (sd)->cache_hot_time)
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+ __put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
/*
* These are the runqueue data structures:
*/
diff --git a/kernel/signal.c b/kernel/signal.c
index d7611f1..64737c7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -329,13 +329,20 @@
/* Ok, we're done with the signal handlers */
tsk->sighand = NULL;
if (atomic_dec_and_test(&sighand->count))
- kmem_cache_free(sighand_cachep, sighand);
+ sighand_free(sighand);
}
void exit_sighand(struct task_struct *tsk)
{
write_lock_irq(&tasklist_lock);
- __exit_sighand(tsk);
+ rcu_read_lock();
+ if (tsk->sighand != NULL) {
+ struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
+ spin_lock(&sighand->siglock);
+ __exit_sighand(tsk);
+ spin_unlock(&sighand->siglock);
+ }
+ rcu_read_unlock();
write_unlock_irq(&tasklist_lock);
}
@@ -345,12 +352,14 @@
void __exit_signal(struct task_struct *tsk)
{
struct signal_struct * sig = tsk->signal;
- struct sighand_struct * sighand = tsk->sighand;
+ struct sighand_struct * sighand;
if (!sig)
BUG();
if (!atomic_read(&sig->count))
BUG();
+ rcu_read_lock();
+ sighand = rcu_dereference(tsk->sighand);
spin_lock(&sighand->siglock);
posix_cpu_timers_exit(tsk);
if (atomic_dec_and_test(&sig->count)) {
@@ -358,6 +367,7 @@
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
tsk->signal = NULL;
+ __exit_sighand(tsk);
spin_unlock(&sighand->siglock);
flush_sigqueue(&sig->shared_pending);
} else {
@@ -389,9 +399,11 @@
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
sig->sched_time += tsk->sched_time;
+ __exit_sighand(tsk);
spin_unlock(&sighand->siglock);
sig = NULL; /* Marker for below. */
}
+ rcu_read_unlock();
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
flush_sigqueue(&tsk->pending);
if (sig) {
@@ -1080,18 +1092,28 @@
}
/*
- * Must be called with the tasklist_lock held for reading!
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
unsigned long flags;
+ struct sighand_struct *sp;
int ret;
+retry:
ret = check_kill_permission(sig, info, p);
- if (!ret && sig && p->sighand) {
- spin_lock_irqsave(&p->sighand->siglock, flags);
+ if (!ret && sig && (sp = p->sighand)) {
+ if (!get_task_struct_rcu(p))
+ return -ESRCH;
+ spin_lock_irqsave(&sp->siglock, flags);
+ if (p->sighand != sp) {
+ spin_unlock_irqrestore(&sp->siglock, flags);
+ put_task_struct(p);
+ goto retry;
+ }
ret = __group_send_sig_info(sig, info, p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ spin_unlock_irqrestore(&sp->siglock, flags);
+ put_task_struct(p);
}
return ret;
@@ -1136,14 +1158,21 @@
kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
+ int acquired_tasklist_lock = 0;
struct task_struct *p;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
+ if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
+ read_lock(&tasklist_lock);
+ acquired_tasklist_lock = 1;
+ }
p = find_task_by_pid(pid);
error = -ESRCH;
if (p)
error = group_send_sig_info(sig, info, p);
- read_unlock(&tasklist_lock);
+ if (unlikely(acquired_tasklist_lock))
+ read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return error;
}
@@ -1355,16 +1384,54 @@
{
unsigned long flags;
int ret = 0;
+ struct sighand_struct *sh;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
- read_lock(&tasklist_lock);
+
+ /*
+ * The rcu based delayed sighand destroy makes it possible to
+ * run this without tasklist lock held. The task struct itself
+ * cannot go away as create_timer did get_task_struct().
+ *
+ * We return -1, when the task is marked exiting, so
+ * posix_timer_event can redirect it to the group leader
+ */
+ rcu_read_lock();
if (unlikely(p->flags & PF_EXITING)) {
ret = -1;
goto out_err;
}
- spin_lock_irqsave(&p->sighand->siglock, flags);
+retry:
+ sh = rcu_dereference(p->sighand);
+
+ spin_lock_irqsave(&sh->siglock, flags);
+ if (p->sighand != sh) {
+ /* We raced with exec() in a multithreaded process... */
+ spin_unlock_irqrestore(&sh->siglock, flags);
+ goto retry;
+ }
+
+ /*
+ * We do the check here again to handle the following scenario:
+ *
+ * CPU 0 CPU 1
+ * send_sigqueue
+ * check PF_EXITING
+ * interrupt exit code running
+ * __exit_signal
+ * lock sighand->siglock
+ * unlock sighand->siglock
+ * lock sh->siglock
+ * add(tsk->pending) flush_sigqueue(tsk->pending)
+ *
+ */
+
+ if (unlikely(p->flags & PF_EXITING)) {
+ ret = -1;
+ goto out;
+ }
if (unlikely(!list_empty(&q->list))) {
/*
@@ -1388,9 +1455,9 @@
signal_wake_up(p, sig == SIGKILL);
out:
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ spin_unlock_irqrestore(&sh->siglock, flags);
out_err:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
@@ -1402,7 +1469,9 @@
int ret = 0;
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
read_lock(&tasklist_lock);
+ /* Since it_lock is held, p->sighand cannot be NULL. */
spin_lock_irqsave(&p->sighand->siglock, flags);
handle_stop_signal(sig, p);
@@ -1436,7 +1505,7 @@
out:
spin_unlock_irqrestore(&p->sighand->siglock, flags);
read_unlock(&tasklist_lock);
- return(ret);
+ return ret;
}
/*