sched: fix overload performance: buddy wakeups

Currently we always schedule the leftmost task in the runqueue. When
runtimes are very short because of some server/client ping-pong,
especially in over-saturated workloads, this cycles through all tasks,
thrashing the cache.

Reduce cache thrashing by keeping dependent tasks together, i.e. by
running newly woken tasks first. However, by not running the leftmost
task first we could starve other tasks, because the wakee could gain
unlimited runtime.

Therefore we only run the wakee if it is within a small
(wakeup_granularity) window of the leftmost task. This preserves
fairness, but still alternates server/client task groups.
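
In other words, the "next buddy" is only preferred when its vruntime is
at most one wakeup granularity ahead of the leftmost entity. As an
illustration only, a minimal userspace sketch of that selection rule
follows; struct entity, pick() and the wakeup_gran parameter are made up
for this example, and the load-weight scaling that calc_delta_fair()
applies in the patch below is elided:

#include <stdint.h>
#include <stddef.h>

struct entity {
	int64_t vruntime;	/* virtual runtime, in ns */
};

/*
 * Pick between the leftmost (fairest) entity and the last-woken
 * "next" buddy: run the buddy only if it is not behind the leftmost
 * and no more than one wakeup granularity ahead of it.
 */
static struct entity *
pick(struct entity *leftmost, struct entity *next, int64_t wakeup_gran)
{
	int64_t diff;

	if (!next)
		return leftmost;

	diff = next->vruntime - leftmost->vruntime;
	if (diff < 0)		/* buddy behind the leftmost: run the leftmost */
		return leftmost;
	if (diff > wakeup_gran)	/* buddy too far ahead: picking it would be unfair */
		return leftmost;

	return next;		/* buddy is close enough: keep the pair cache hot */
}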

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9d003c9..31c4a29 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -207,6 +207,9 @@
 		}
 	}
 
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;
+
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
@@ -626,12 +629,32 @@
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }
 
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 diff, gran;
+
+	if (!cfs_rq->next)
+		return se;
+
+	diff = cfs_rq->next->vruntime - se->vruntime;
+	if (diff < 0)
+		return se;
+
+	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+	if (diff > gran)
+		return se;
+
+	return cfs_rq->next;
+}
+
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = NULL;
 
 	if (first_fair(cfs_rq)) {
 		se = __pick_next_entity(cfs_rq);
+		se = pick_next(cfs_rq, se);
 		set_next_entity(cfs_rq, se);
 	}
 
@@ -1070,6 +1093,9 @@
 		resched_task(curr);
 		return;
 	}
+
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):