Speed up divides by cpu_power in scheduler
I noticed expensive divides done in try_to_wakeup() and
find_busiest_group() on a bi dual core Opteron machine (total of 4 cores),
moderatly loaded (15.000 context switch per second)
oprofile numbers :
CPU: AMD64 processors, speed 2600.05 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit
mask of 0x00 (No unit mask) count 50000
samples % symbol name
...
613914 1.0498 try_to_wake_up
834 0.0013 :ffffffff80227ae1: div %rcx
77513 0.1191 :ffffffff80227ae4: mov %rax,%r11
608893 1.0413 find_busiest_group
1841 0.0031 :ffffffff802260bf: div %rdi
140109 0.2394 :ffffffff802260c2: test %sil,%sil
Some of these divides can use the reciprocal divides we introduced some
time ago (currently used in slab AFAIK)
We can assume a load will fit in a 32bits number, because with a
SCHED_LOAD_SCALE=128 value, its still a theorical limit of 33554432
When/if we reach this limit one day, probably cpus will have a fast
hardware divide and we can zap the reciprocal divide trick.
Ingo suggested to rename cpu_power to __cpu_power to make clear it should
not be modified without changing its reciprocal value too.
I did not convert the divide in cpu_avg_load_per_task(), because tracking
nr_running changes may be not worth it ? We could use a static table of 32
reciprocal values but it would add a conditional branch and table lookup.
[akpm@linux-foundation.org: !SMP build fix]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/kernel/sched.c b/kernel/sched.c
index 7459928..e4a5888 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
/*
@@ -181,6 +182,27 @@
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
@@ -1256,7 +1278,8 @@
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
@@ -2367,12 +2390,13 @@
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
@@ -2483,8 +2507,8 @@
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -2518,28 +2542,29 @@
* moving them.
*/
- pwr_now += busiest->cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -5501,7 +5526,7 @@
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5678,7 +5703,7 @@
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
@@ -6367,7 +6392,7 @@
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
@@ -6442,6 +6467,8 @@
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
@@ -6452,18 +6479,16 @@
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
@@ -6623,7 +6648,7 @@
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
@@ -6651,7 +6676,7 @@
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);