sched,perf: Fix periodic timers
In the below two commits (see Fixes) we have periodic timers that can
stop themselves when they're no longer required, but need to be
(re)-started when their idle condition changes.
Further complications is that we want the timer handler to always do
the forward such that it will always correctly deal with the overruns,
and we do not want to race such that the handler has already decided
to stop, but the (external) restart sees the timer still active and we
end up with a 'lost' timer.
The problem with the current code is that the re-start can come before
the callback does the forward, at which point the forward from the
callback will WARN about forwarding an enqueued timer.
Now, conceptually its easy to detect if you're before or after the fwd
by comparing the expiration time against the current time. Of course,
that's expensive (and racy) because we don't have the current time.
Alternatively one could cache this state inside the timer, but then
everybody pays the overhead of maintaining this extra state, and that
is undesired.
The only other option that I could see is the external timer_active
variable, which I tried to kill before. I would love a nicer interface
for this seemingly simple 'problem' but alas.
Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage")
Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers")
Cc: pjt@google.com
Cc: tglx@linutronix.de
Cc: klamm@yandex-team.ru
Cc: mingo@kernel.org
Cc: bsegall@google.com
Cc: hpa@zytor.com
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf..cf3342a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -566,8 +566,12 @@
struct perf_event_context *task_ctx;
int active_oncpu;
int exclusive;
+
+ raw_spinlock_t hrtimer_lock;
struct hrtimer hrtimer;
ktime_t hrtimer_interval;
+ unsigned int hrtimer_active;
+
struct pmu *unique_pmu;
struct perf_cgroup *cgrp;
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f528829..d9c93f3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -752,24 +752,21 @@
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
- enum hrtimer_restart ret = HRTIMER_NORESTART;
int rotations = 0;
WARN_ON(!irqs_disabled());
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
rotations = perf_rotate_context(cpuctx);
- /*
- * arm timer if needed
- */
- if (rotations) {
+ raw_spin_lock(&cpuctx->hrtimer_lock);
+ if (rotations)
hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
- ret = HRTIMER_RESTART;
- }
+ else
+ cpuctx->hrtimer_active = 0;
+ raw_spin_unlock(&cpuctx->hrtimer_lock);
- return ret;
+ return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
@@ -792,7 +789,8 @@
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
timer->function = perf_mux_hrtimer_handler;
}
@@ -800,15 +798,20 @@
{
struct hrtimer *timer = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
+ unsigned long flags;
/* not for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
return 0;
- if (hrtimer_is_queued(timer))
- return 0;
+ raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+ if (!cpuctx->hrtimer_active) {
+ cpuctx->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+ raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
- hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8a6196..e84aeb2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,18 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
- /*
- * Do not forward the expiration time of active timers;
- * we do not want to loose an overrun.
- */
- if (!hrtimer_active(period_timer))
- hrtimer_forward_now(period_timer, period);
-
- hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
-}
-
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3b32eb..69be282 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3870,8 +3870,9 @@
if (runtime_refresh_within(cfs_b, min_left))
return;
- start_bandwidth_timer(&cfs_b->slack_timer,
- ns_to_ktime(cfs_bandwidth_slack_period));
+ hrtimer_start(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period),
+ HRTIMER_MODE_REL);
}
/* we know any runtime found here is valid as update_curr() precedes return */
@@ -4012,6 +4013,8 @@
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ if (idle)
+ cfs_b->period_active = 0;
raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -4025,7 +4028,7 @@
cfs_b->period = ns_to_ktime(default_cfs_period());
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -4039,7 +4042,13 @@
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
- start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+ lockdep_assert_held(&cfs_b->lock);
+
+ if (!cfs_b->period_active) {
+ cfs_b->period_active = 1;
+ hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+ hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b0febf2..e43da53 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -31,6 +31,8 @@
idle = do_sched_rt_period_timer(rt_b, overrun);
raw_spin_lock(&rt_b->rt_runtime_lock);
}
+ if (idle)
+ rt_b->rt_period_active = 0;
raw_spin_unlock(&rt_b->rt_runtime_lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -54,7 +56,11 @@
return;
raw_spin_lock(&rt_b->rt_runtime_lock);
- start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ if (!rt_b->rt_period_active) {
+ rt_b->rt_period_active = 1;
+ hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+ hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ }
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 08606a1..f9a58ef 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -131,6 +131,7 @@
ktime_t rt_period;
u64 rt_runtime;
struct hrtimer rt_period_timer;
+ unsigned int rt_period_active;
};
void __dl_clear_params(struct task_struct *p);
@@ -215,7 +216,7 @@
s64 hierarchical_quota;
u64 runtime_expires;
- int idle;
+ int idle, period_active;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
@@ -1406,8 +1407,6 @@
static inline void sched_avg_update(struct rq *rq) { }
#endif
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/