mirror of https://github.com/torvalds/linux.git
sched/deadline: Fix dl_server getting stuck
John found it was easy to hit lockup warnings when running locktorture on a 2 CPU VM, which he bisected down to: commitcccb45d7c4("sched/deadline: Less agressive dl_server handling"). While debugging it seems there is a chance where we end up with the dl_server dequeued, with dl_se->dl_server_active. This causes dl_server_start() to return without enqueueing the dl_server, thus it fails to run when RT tasks starve the cpu. When this happens, dl_server_timer() catches the '!dl_se->server_has_tasks(dl_se)' case, which then calls replenish_dl_entity() and dl_server_stopped() and finally return HRTIMER_NO_RESTART. This ends in no new timer and also no enqueue, leaving the dl_server 'dead', allowing starvation. What should have happened is for the bandwidth timer to start the zero-laxity timer, which in turn would enqueue the dl_server and cause dl_se->server_pick_task() to be called -- which will stop the dl_server if no fair tasks are observed for a whole period. IOW, it is totally irrelevant if there are fair tasks at the moment of bandwidth refresh. This removes all dl_se->server_has_tasks() users, so remove the whole thing. Fixes:cccb45d7c4("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz <jstultz@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Tested-by: John Stultz <jstultz@google.com>
This commit is contained in:
parent
f83ec76bf2
commit
4ae8d9aa9f
|
|
@ -733,7 +733,6 @@ struct sched_dl_entity {
|
|||
* runnable task.
|
||||
*/
|
||||
struct rq *rq;
|
||||
dl_server_has_tasks_f server_has_tasks;
|
||||
dl_server_pick_f server_pick_task;
|
||||
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
|
|
|
|||
|
|
@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se)
|
|||
*/
|
||||
if (dl_se->dl_defer && !dl_se->dl_defer_running &&
|
||||
dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) {
|
||||
if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) {
|
||||
if (!is_dl_boosted(dl_se)) {
|
||||
|
||||
/*
|
||||
* Set dl_se->dl_defer_armed and dl_throttled variables to
|
||||
|
|
@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf)
|
|||
/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */
|
||||
static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC;
|
||||
|
||||
static bool dl_server_stopped(struct sched_dl_entity *dl_se);
|
||||
|
||||
static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se)
|
||||
{
|
||||
struct rq *rq = rq_of_dl_se(dl_se);
|
||||
|
|
@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
|
|||
if (!dl_se->dl_runtime)
|
||||
return HRTIMER_NORESTART;
|
||||
|
||||
if (!dl_se->server_has_tasks(dl_se)) {
|
||||
replenish_dl_entity(dl_se);
|
||||
dl_server_stopped(dl_se);
|
||||
return HRTIMER_NORESTART;
|
||||
}
|
||||
|
||||
if (dl_se->dl_defer_armed) {
|
||||
/*
|
||||
* First check if the server could consume runtime in background.
|
||||
|
|
@ -1625,11 +1617,9 @@ static bool dl_server_stopped(struct sched_dl_entity *dl_se)
|
|||
}
|
||||
|
||||
void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
|
||||
dl_server_has_tasks_f has_tasks,
|
||||
dl_server_pick_f pick_task)
|
||||
{
|
||||
dl_se->rq = rq;
|
||||
dl_se->server_has_tasks = has_tasks;
|
||||
dl_se->server_pick_task = pick_task;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
|
|||
return pick_next_task_fair(rq, prev, NULL);
|
||||
}
|
||||
|
||||
static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
|
||||
{
|
||||
return !!dl_se->rq->cfs.nr_queued;
|
||||
}
|
||||
|
||||
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
|
||||
{
|
||||
return pick_task_fair(dl_se->rq);
|
||||
|
|
@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq)
|
|||
|
||||
init_dl_entity(dl_se);
|
||||
|
||||
dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task);
|
||||
dl_server_init(dl_se, rq, fair_server_pick_task);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -365,9 +365,6 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6
|
|||
*
|
||||
* dl_se::rq -- runqueue we belong to.
|
||||
*
|
||||
* dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the
|
||||
* server when it runs out of tasks to run.
|
||||
*
|
||||
* dl_se::server_pick() -- nested pick_next_task(); we yield the period if this
|
||||
* returns NULL.
|
||||
*
|
||||
|
|
@ -383,7 +380,6 @@ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec);
|
|||
extern void dl_server_start(struct sched_dl_entity *dl_se);
|
||||
extern void dl_server_stop(struct sched_dl_entity *dl_se);
|
||||
extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
|
||||
dl_server_has_tasks_f has_tasks,
|
||||
dl_server_pick_f pick_task);
|
||||
extern void sched_init_dl_servers(void);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue