sched/numa: Avoid migrating tasks that are placed on their preferred node
This patch classifies scheduler domains and runqueues into types depending
the number of tasks that are about their NUMA placement and the number
that are currently running on their preferred node. The types are
regular: There are tasks running that do not care about their NUMA
placement.
remote: There are tasks running that care about their placement but are
currently running on a node remote to their ideal placement
all: No distinction
To implement this the patch tracks the number of tasks that are optimally
NUMA placed (rq->nr_preferred_running) and the number of tasks running
that care about their placement (nr_numa_running). The load balancer
uses this information to avoid migrating idea placed NUMA tasks as long
as better options for load balancing exists. For example, it will not
consider balancing between a group whose tasks are all perfectly placed
and a group with remote tasks.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1381141781-10992-56-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 423316c..5166b9b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,18 @@
*/
unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
struct numa_group {
atomic_t refcount;
@@ -1227,6 +1239,8 @@
if (env.best_cpu == -1)
return -EAGAIN;
+ sched_setnuma(p, env.dst_nid);
+
if (env.best_task == NULL) {
int ret = migrate_task_to(p, env.best_cpu);
return ret;
@@ -1342,8 +1356,7 @@
/* Preferred node as the node with the most faults */
if (max_faults && max_nid != p->numa_preferred_nid) {
/* Update the preferred nid and migrate task if possible */
- p->numa_preferred_nid = max_nid;
- p->numa_migrate_seq = 1;
+ sched_setnuma(p, max_nid);
numa_migrate_preferred(p);
}
}
@@ -1741,6 +1754,14 @@
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -1750,8 +1771,12 @@
if (!parent_entity(se))
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
- if (entity_is_task(se))
- list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+ if (entity_is_task(se)) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ account_numa_enqueue(rq, task_of(se));
+ list_add(&se->group_node, &rq->cfs_tasks);
+ }
#endif
cfs_rq->nr_running++;
}
@@ -1762,8 +1787,10 @@
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
+ }
cfs_rq->nr_running--;
}
@@ -4605,6 +4632,8 @@
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+enum fbq_type { regular, remote, all };
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -4631,6 +4660,8 @@
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
+
+ enum fbq_type fbq_type;
};
/*
@@ -5092,6 +5123,10 @@
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+ unsigned int nr_numa_running;
+ unsigned int nr_preferred_running;
+#endif
};
/*
@@ -5409,6 +5444,10 @@
sgs->group_load += load;
sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
@@ -5474,14 +5513,43 @@
return false;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
+ return regular;
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
+ return remote;
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ if (rq->nr_running > rq->nr_numa_running)
+ return regular;
+ if (rq->nr_running > rq->nr_preferred_running)
+ return remote;
+ return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+ return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+ return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
-static inline void update_sd_lb_stats(struct lb_env *env,
- struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
@@ -5538,6 +5606,9 @@
sg = sg->next;
} while (sg != env->sd->groups);
+
+ if (env->sd->flags & SD_NUMA)
+ env->fbq_type = fbq_classify_group(&sds->busiest_stat);
}
/**
@@ -5841,15 +5912,39 @@
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power,
- SCHED_POWER_SCALE);
- unsigned long wl;
+ unsigned long power, capacity, wl;
+ enum fbq_type rt;
+ rq = cpu_rq(i);
+ rt = fbq_classify_rq(rq);
+
+ /*
+ * We classify groups/runqueues into three groups:
+ * - regular: there are !numa tasks
+ * - remote: there are numa tasks that run on the 'wrong' node
+ * - all: there is no distinction
+ *
+ * In order to avoid migrating ideally placed numa tasks,
+ * ignore those when there's better options.
+ *
+ * If we ignore the actual busiest queue to migrate another
+ * task, the next balance pass can still reduce the busiest
+ * queue by moving tasks around inside the node.
+ *
+ * If we cannot move enough load due to this classification
+ * the next pass will adjust the group classification and
+ * allow migration of more tasks.
+ *
+ * Both cases only affect the total convergence complexity.
+ */
+ if (rt > env->fbq_type)
+ continue;
+
+ power = power_of(i);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
- rq = cpu_rq(i);
wl = weighted_cpuload(i);
/*
@@ -5966,6 +6061,7 @@
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
+ .fbq_type = all,
};
/*