From 3e816361e94a0e79b1aabf44abec552e9698b196 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 2 Apr 2025 11:09:25 -0700 Subject: [PATCH 01/26] sched/tracepoints: Move and extend the sched_process_exit() tracepoint It is useful to be able to access current->mm at task exit to, say, record a bunch of VMA information right before the task exits (e.g., for stack symbolization reasons when dealing with short-lived processes that exit in the middle of profiling session). Currently, trace_sched_process_exit() is triggered after exit_mm() which resets current->mm to NULL making this tracepoint unsuitable for inspecting and recording task's mm_struct-related data when tracing process lifetimes. There is a particularly suitable place, though, right after taskstats_exit() is called, but before we do exit_mm() and other exit_*() resource teardowns. taskstats performs a similar kind of accounting that some applications do with BPF, and so co-locating them seems like a good fit. So that's where trace_sched_process_exit() is moved with this patch. Also, existing trace_sched_process_exit() tracepoint is notoriously missing `group_dead` flag that is certainly useful in practice and some of our production applications have to work around this. So plumb `group_dead` through while at it, to have a richer and more complete tracepoint. Note that we can't use sched_process_template anymore, and so we use TRACE_EVENT()-based tracepoint definition. But all the field names and order, as well as assign and output logic remain intact. We just add one extra field at the end in backwards-compatible way. Document the dependency to sched_process_template anyway. Signed-off-by: Andrii Nakryiko Signed-off-by: Ingo Molnar Acked-by: Steven Rostedt (Google) Acked-by: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250402180925.90914-1-andrii@kernel.org --- include/trace/events/sched.h | 34 ++++++++++++++++++++++++++++++---- kernel/exit.c | 2 +- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8994e97d86c1..3bec9fb73a36 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -326,11 +326,37 @@ DEFINE_EVENT(sched_process_template, sched_process_free, TP_ARGS(p)); /* - * Tracepoint for a task exiting: + * Tracepoint for a task exiting. + * Note, it's a superset of sched_process_template and should be kept + * compatible as much as possible. sched_process_exits has an extra + * `group_dead` argument, so sched_process_template can't be used, + * unfortunately, just like sched_migrate_task above. */ -DEFINE_EVENT(sched_process_template, sched_process_exit, - TP_PROTO(struct task_struct *p), - TP_ARGS(p)); +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p, bool group_dead), + + TP_ARGS(p, group_dead), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( bool, group_dead ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ + __entry->group_dead = group_dead; + ), + + TP_printk("comm=%s pid=%d prio=%d group_dead=%s", + __entry->comm, __entry->pid, __entry->prio, + __entry->group_dead ? 
"true" : "false" + ) +); /* * Tracepoint for waiting on task to unschedule: diff --git a/kernel/exit.c b/kernel/exit.c index 1b51dc099f1e..f1db86dcbeb1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -936,12 +936,12 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + trace_sched_process_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); - trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); From 8feb053d53194382fcfb68231296fdc220497ea6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Mar 2025 22:23:23 +0100 Subject: [PATCH 02/26] sched: Fix trace_sched_switch(.prev_state) Gabriele noted that in case of signal_pending_state(), the tracepoint sees a stale task-state. Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event") Reported-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Cc: Valentin Schneider --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cfaca3040b2f..042c978f7c9b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6571,12 +6571,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * Otherwise marks the task's __state as RUNNING */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long task_state) + unsigned long *task_state_p) { + unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); + *task_state_p = TASK_RUNNING; return false; } @@ -6713,7 +6715,7 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, prev_state); + try_to_block_task(rq, prev, &prev_state); switch_count = &prev->nvcsw; } From f55dac1dafb3334be1d5b54bf385e8cfaa0ab3b3 Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Tue, 4 Mar 2025 10:08:43 -0600 Subject: [PATCH 03/26] sched/topology: improve topology_span_sane speed Use a different approach to topology_span_sane(), that checks for the same constraint of no partial overlaps for any two CPU sets for non-NUMA topology levels, but does so in a way that is O(N) rather than O(N^2). Instead of comparing with all other masks to detect collisions, keep one mask that includes all CPUs seen so far and detect collisions with a single cpumask_intersects test. If the current mask has no collisions with previously seen masks, it should be a new mask, which can be uniquely identified by the lowest bit set in this mask. Keep a pointer to this mask for future reference (in an array indexed by the lowest bit set), and add the CPUs in this mask to the list of those seen. If the current mask does collide with previously seen masks, it should be exactly equal to a mask seen before, looked up in the same array indexed by the lowest bit set in the mask, a single comparison. Move the topology_span_sane() check out of the existing topology level loop, let it use its own loop so that the array allocation can be done only once, shared across levels. On a system with 1920 processors (16 sockets, 60 cores, 2 threads), the average time to take one processor offline is reduced from 2.18 seconds to 1.01 seconds. (Off-lining 959 of 1920 processors took 34m49.765s without this change, 16m10.038s with this change in place.) 
Signed-off-by: Steve Wahl Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Madadi Vineeth Reddy Tested-by: K Prateek Nayak Tested-by: Valentin Schneider Tested-by: Madadi Vineeth Reddy Link: https://lore.kernel.org/r/20250304160844.75373-2-steve.wahl@hpe.com --- kernel/sched/topology.c | 85 ++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f1ebc60d967f..439e6ce9900b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2347,36 +2347,69 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve /* * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for - * any two given CPUs at this (non-NUMA) topology level. + * any two given CPUs on non-NUMA topology levels. */ -static bool topology_span_sane(struct sched_domain_topology_level *tl, - const struct cpumask *cpu_map, int cpu) +static bool topology_span_sane(const struct cpumask *cpu_map) { - int i = cpu + 1; + struct sched_domain_topology_level *tl; + const struct cpumask **masks; + struct cpumask *covered; + int cpu, id; + bool ret = false; - /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) - return true; + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + + masks = kmalloc_array(nr_cpu_ids, sizeof(struct cpumask *), GFP_KERNEL); + if (!masks) + return ret; + + for_each_sd_topology(tl) { + + /* NUMA levels are allowed to overlap */ + if (tl->flags & SDTL_OVERLAP) + continue; + + cpumask_clear(covered); + memset(masks, 0, nr_cpu_ids * sizeof(struct cpumask *)); - /* - * Non-NUMA levels cannot partially overlap - they must be either - * completely equal or completely disjoint. Otherwise we can end up - * breaking the sched_group lists - i.e. a later get_group() pass - * breaks the linking done for an earlier span. - */ - for_each_cpu_from(i, cpu_map) { /* - * We should 'and' all those masks with 'cpu_map' to exactly - * match the topology we're about to build, but that can only - * remove CPUs, which only lessens our ability to detect - * overlaps + * Non-NUMA levels cannot partially overlap - they must be either + * completely equal or completely disjoint. Otherwise we can end up + * breaking the sched_group lists - i.e. a later get_group() pass + * breaks the linking done for an earlier span. 
*/ - if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && - cpumask_intersects(tl->mask(cpu), tl->mask(i))) - return false; - } + for_each_cpu(cpu, cpu_map) { + /* lowest bit set in this mask is used as a unique id */ + id = cpumask_first(tl->mask(cpu)); - return true; + /* zeroed masks cannot possibly collide */ + if (id >= nr_cpu_ids) + continue; + + /* if this mask doesn't collide with what we've already seen */ + if (!cpumask_intersects(tl->mask(cpu), covered)) { + /* this failing would be an error in this algorithm */ + if (WARN_ON(masks[id])) + goto notsane; + + /* record the mask we saw for this id */ + masks[id] = tl->mask(cpu); + cpumask_or(covered, tl->mask(cpu), covered); + } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) { + /* + * a collision with covered should have exactly matched + * a previously seen mask with the same id + */ + goto notsane; + } + } + } + ret = true; + + notsane: + kfree(masks); + return ret; } /* @@ -2408,9 +2441,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att sd = NULL; for_each_sd_topology(tl) { - if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) - goto error; - sd = build_sched_domain(tl, cpu_map, attr, sd, i); has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; @@ -2424,6 +2454,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } + if (WARN_ON(!topology_span_sane(cpu_map))) + goto error; + /* Build the groups for the domains */ for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { From ce29a7da84cdeafc7c08c32d329037c71ab3f3dd Mon Sep 17 00:00:00 2001 From: Steve Wahl Date: Tue, 4 Mar 2025 10:08:44 -0600 Subject: [PATCH 04/26] sched/topology: Refinement to topology_span_sane speedup Simplify the topology_span_sane code further, removing the need to allocate an array and gotos used to make sure the array gets freed. This version is in a separate commit because it could return a different sanity result than the previous code, but only in odd circumstances that are not expected to actually occur; for example, when a CPU is not listed in its own mask. 
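In the same user-space terms as the sketch given after the previous patch, the refinement replaces the per-id pointer array with a second bitmap, assuming every CPU appears in its own mask:

    #include <stdbool.h>
    #include <stdint.h>

    #define NCPUS 8

    static bool spans_sane2(const uint64_t mask_of[NCPUS])
    {
        uint64_t covered = 0, id_seen = 0;

        for (int cpu = 0; cpu < NCPUS; cpu++) {
            uint64_t m = mask_of[cpu];
            int id = __builtin_ctzll(m);    /* lowest set bit == unique id */

            if (id_seen & (1ULL << id)) {
                /* id seen before: spans must be identical */
                if (mask_of[id] != m)
                    return false;
            } else {
                /* new id: span must be disjoint from all seen CPUs */
                if (m & covered)
                    return false;
                covered |= m;
                id_seen |= 1ULL << id;
            }
        }
        return true;
    }

This is what lets the kernel version drop the kmalloc_array() and the goto-based cleanup: the two local words correspond to the preallocated sched_domains_tmpmask and sched_domains_tmpmask2.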
Signed-off-by: Steve Wahl Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Madadi Vineeth Reddy Tested-by: K Prateek Nayak Tested-by: Valentin Schneider Tested-by: Madadi Vineeth Reddy Link: https://lore.kernel.org/r/20250304160844.75373-3-steve.wahl@hpe.com --- kernel/sched/topology.c | 52 +++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 439e6ce9900b..b334f254f5e3 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2352,17 +2352,12 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve static bool topology_span_sane(const struct cpumask *cpu_map) { struct sched_domain_topology_level *tl; - const struct cpumask **masks; - struct cpumask *covered; - int cpu, id; - bool ret = false; + struct cpumask *covered, *id_seen; + int cpu; lockdep_assert_held(&sched_domains_mutex); covered = sched_domains_tmpmask; - - masks = kmalloc_array(nr_cpu_ids, sizeof(struct cpumask *), GFP_KERNEL); - if (!masks) - return ret; + id_seen = sched_domains_tmpmask2; for_each_sd_topology(tl) { @@ -2371,7 +2366,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) continue; cpumask_clear(covered); - memset(masks, 0, nr_cpu_ids * sizeof(struct cpumask *)); + cpumask_clear(id_seen); /* * Non-NUMA levels cannot partially overlap - they must be either @@ -2380,36 +2375,27 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. */ for_each_cpu(cpu, cpu_map) { + const struct cpumask *tl_cpu_mask = tl->mask(cpu); + int id; + /* lowest bit set in this mask is used as a unique id */ - id = cpumask_first(tl->mask(cpu)); + id = cpumask_first(tl_cpu_mask); - /* zeroed masks cannot possibly collide */ - if (id >= nr_cpu_ids) - continue; + if (cpumask_test_cpu(id, id_seen)) { + /* First CPU has already been seen, ensure identical spans */ + if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + return false; + } else { + /* First CPU hasn't been seen before, ensure it's a completely new span */ + if (cpumask_intersects(tl_cpu_mask, covered)) + return false; - /* if this mask doesn't collide with what we've already seen */ - if (!cpumask_intersects(tl->mask(cpu), covered)) { - /* this failing would be an error in this algorithm */ - if (WARN_ON(masks[id])) - goto notsane; - - /* record the mask we saw for this id */ - masks[id] = tl->mask(cpu); - cpumask_or(covered, tl->mask(cpu), covered); - } else if ((!masks[id]) || !cpumask_equal(masks[id], tl->mask(cpu))) { - /* - * a collision with covered should have exactly matched - * a previously seen mask with the same id - */ - goto notsane; + cpumask_or(covered, covered, tl_cpu_mask); + cpumask_set_cpu(id, id_seen); } } } - ret = true; - - notsane: - kfree(masks); - return ret; + return true; } /* From f2d650618bc721760199ae0133c73ec32c63817e Mon Sep 17 00:00:00 2001 From: Pierre Gondois Date: Tue, 25 Mar 2025 16:05:41 +0100 Subject: [PATCH 05/26] sched/fair: Allow decaying util_est when util_avg > CPU capa commit 10a35e6812aa ("sched/pelt: Skip updating util_est when utilization is higher than CPU's capacity") prevents util_est from being updated if util_avg is higher than the underlying CPU capacity to avoid overestimating the task when the CPU is capped (due to thermal issue for instance). In this scenario, the task will miss its deadlines and start overlapping its wake-up events for instance. 
The task will appear as always running when the CPU is just not powerful enough to allow having a good estimation of the task. commit b8c96361402a ("sched/fair/util_est: Implement faster ramp-up EWMA on utilization increases") sets ewma to util_avg when ewma > util_avg, allowing ewma to quickly grow instead of slowly converging to the new util_avg value when a task profile changes from small to big. However, the two conditions: - Check util_avg against max CPU capacity - Check whether util_est > util_avg are placed in an order such that it is possible to set util_est to a value higher than the CPU capacity if util_est > util_avg, but util_est is prevented from decaying as long as: CPU capacity < util_avg < util_est. Just remove the check as either: 1. There is idle time on the CPU. In that case the util_avg value of the task is actually correct. It is possible that the task missed a deadline and appears bigger, but this is also the case when the util_avg of the task is lower than the maximum CPU capacity. 2. There is no idle time. In that case, the util_avg value might as well be an underestimation of the size of the task. It is possible that undesired frequency spikes will appear when the task is later enqueued with an inflated util_est value, but the frequency spike might as well be deserved. The absence of idle time prevents us from drawing any conclusion. Signed-off-by: Pierre Gondois Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20250325150542.1077344-1-pierre.gondois@arm.com --- kernel/sched/fair.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e43993a4e580..0c19459c8042 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4932,13 +4932,6 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (last_ewma_diff < UTIL_EST_MARGIN) goto done; - /* - * To avoid overestimation of actual task utilization, skip updates if - * we cannot grant there is idle time in this CPU. - */ - if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) - return; - /* * To avoid underestimate of task utilization, skip updates of EWMA if * we cannot grant that thread got all CPU time it wanted. From 433bce5dadb4ec3d5eda99c5125926c045b79005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:33 +0100 Subject: [PATCH 06/26] sched: Convert CONFIG_RT_GROUP_SCHED macros to code conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert the blocks guarded by macros to regular code so that the RT group code gets more compile validation. Reasoning is in Documentation/process/coding-style.rst 21) Conditional Compilation. With that, no functional change is expected. Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-2-mkoutny@suse.com --- kernel/sched/rt.c | 10 ++++------ kernel/sched/syscalls.c | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index fa03ec3ed56a..2ade81e69db0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1066,13 +1066,12 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue.
*/ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && prio < prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } @@ -1082,13 +1081,12 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); -#ifdef CONFIG_RT_GROUP_SCHED /* * Change rq's cpupri only if rt_rq is the top queue. */ - if (&rq->rt != rt_rq) + if (IS_ENABLED(CONFIG_RT_GROUP_SCHED) && &rq->rt != rt_rq) return; -#endif + if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index c326de1344fb..2bf528116fad 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -640,7 +640,7 @@ int __sched_setscheduler(struct task_struct *p, retval = -EPERM; goto unlock; } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { From e285313f0848157cc3c6827d233a2510167b50cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:34 +0100 Subject: [PATCH 07/26] sched: Remove unneeded macro wrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rt_entity_is_task() has split definitions based on CONFIG_RT_GROUP_SCHED, therefore it can always be used. No functional change intended. Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-3-mkoutny@suse.com --- kernel/sched/rt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2ade81e69db0..61ec29b11ef4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1255,11 +1255,9 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr static inline struct sched_statistics * __schedstats_from_rt_se(struct sched_rt_entity *rt_se) { -#ifdef CONFIG_RT_GROUP_SCHED /* schedstats is not supported for rt group. */ if (!rt_entity_is_task(rt_se)) return NULL; -#endif return &rt_task_of(rt_se)->stats; } From a5a25b32c08a31c03258ec4960bec26caaf76e9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:35 +0100 Subject: [PATCH 08/26] sched: Always initialize rt_rq's task_group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rt_rq->tg may be NULL which denotes the root task_group. Store the pointer to root_task_group directly so that callers may use rt_rq->tg homogeneously. root_task_group always exists with CONFIG_CGROUP_SCHED, and CONFIG_RT_GROUP_SCHED depends on that. This changes the root-level rt_rq's default limit from infinity to the value of (originally) global RT throttling.
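The pattern in miniature, as a hedged user-space sketch with illustrative names: pointing the "no group" case at a real root object lets callers drop the NULL special case entirely.

    #include <stdio.h>

    struct group { long rt_runtime; };

    /* root acts as the sentinel; its runtime is the (old) global limit */
    static struct group root_group = { .rt_runtime = 950000 };

    struct rt_queue { struct group *tg; };

    /* before: if (!q->tg) return RUNTIME_INF; */
    static long runtime_of(const struct rt_queue *q)
    {
        return q->tg->rt_runtime;       /* tg is never NULL any more */
    }

    int main(void)
    {
        struct rt_queue q = { .tg = &root_group };  /* init_rt_rq() analogue */

        printf("%ld\n", runtime_of(&q));    /* 950000, not RUNTIME_INF */
        return 0;
    }

This is also exactly why the root rt_rq's default limit changes as noted above: the sentinel carries a real runtime value instead of implying infinity.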
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-4-mkoutny@suse.com --- kernel/sched/rt.c | 7 ++----- kernel/sched/sched.h | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 61ec29b11ef4..1af3996ec0fb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -89,6 +89,7 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); + rt_rq->tg = &root_task_group; #endif } @@ -482,9 +483,6 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (!rt_rq->tg) - return RUNTIME_INF; - return rt_rq->rt_runtime; } @@ -1154,8 +1152,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; - if (rt_rq->tg) - start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); } static void diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47972f34ea70..c006348102d9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -821,6 +821,8 @@ struct rt_rq { unsigned int rt_nr_boosted; struct rq *rq; +#endif +#ifdef CONFIG_CGROUP_SCHED struct task_group *tg; #endif }; From e34e0131fea1b0f63c2105a1958c94af2ee90f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:36 +0100 Subject: [PATCH 09/26] sched: Add command-line option for RT_GROUP_SCHED toggling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only a simple implementation with a static key wrapper; it will be wired in later. Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-5-mkoutny@suse.com --- .../admin-guide/kernel-parameters.txt | 5 ++++ init/Kconfig | 11 ++++++++ kernel/sched/core.c | 25 +++++++++++++++++++ kernel/sched/sched.h | 17 +++++++++++++ 4 files changed, 58 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f35d5b8c296..168202330cfa 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6280,6 +6280,11 @@ Memory area to be used by remote processor image, managed by CMA. + rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling + when CONFIG_RT_GROUP_SCHED=y. Defaults to + !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED. + Format: + rw [KNL] Mount root device read-write on boot S [KNL] Run init in single mode diff --git a/init/Kconfig b/init/Kconfig index 681f38ee68db..b2c045c71d7f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1082,6 +1082,17 @@ config RT_GROUP_SCHED realtime bandwidth for them. See Documentation/scheduler/sched-rt-group.rst for more information. +config RT_GROUP_SCHED_DEFAULT_DISABLED + bool "Require boot parameter to enable group scheduling for SCHED_RR/FIFO" + depends on RT_GROUP_SCHED + default n + help + When set, the RT group scheduling is disabled by default. The option + is in inverted form so that mere RT_GROUP_SCHED enables the group + scheduling. + + Say N if unsure.
+ config EXT_GROUP_SCHED bool depends on SCHED_CLASS_EXT && CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 042c978f7c9b..58d093a8c1af 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9892,6 +9892,31 @@ static struct cftype cpu_legacy_files[] = { { } /* Terminate */ }; +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DEFINE_STATIC_KEY_FALSE(rt_group_sched); +# else +DEFINE_STATIC_KEY_TRUE(rt_group_sched); +# endif + +static int __init setup_rt_group_sched(char *str) +{ + long val; + + if (kstrtol(str, 0, &val) || val < 0 || val > 1) { + pr_warn("Unable to set rt_group_sched\n"); + return 1; + } + if (val) + static_branch_enable(&rt_group_sched); + else + static_branch_disable(&rt_group_sched); + + return 1; +} +__setup("rt_group_sched=", setup_rt_group_sched); +#endif /* CONFIG_RT_GROUP_SCHED */ + static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *css) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c006348102d9..d1e591f91cf8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1500,6 +1500,23 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ +#ifdef CONFIG_RT_GROUP_SCHED +# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED +DECLARE_STATIC_KEY_FALSE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_unlikely(&rt_group_sched); +} +# else +DECLARE_STATIC_KEY_TRUE(rt_group_sched); +static inline bool rt_group_sched_enabled(void) +{ + return static_branch_likely(&rt_group_sched); +} +# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else +# define rt_group_sched_enabled() false +#endif /* CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { From 61d3164fec2ed283645dc17fcc51959e8f361e18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:37 +0100 Subject: [PATCH 10/26] sched: Skip non-root task_groups with disabled RT_GROUP_SCHED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First, we want to prevent placement of RT tasks on non-root rt_rqs which we achieve in the task migration code that'd fall back to root_task_group's rt_rq. Second, we want to work with only root_task_group's rt_rq when iterating all "real" rt_rqs when RT_GROUP is disabled. To achieve this we keep root_task_group as the first one on the task_groups and break out quickly. 
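The list trick can be sketched in user space as follows (illustrative structures, not the kernel's RCU list API): because the root group is guaranteed to sit first, the disabled case can visit it and stop.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct task_group { struct task_group *next; };

    static struct task_group root_tg = { .next = NULL };    /* always list head */
    static bool rt_group_sched_on;

    static struct task_group *next_task_group(struct task_group *tg)
    {
        if (!rt_group_sched_on)
            return NULL;        /* only the root is "real" when disabled */
        return tg->next;        /* NULL-terminated here for simplicity */
    }

    #define for_each_group(it) \
        for ((it) = &root_tg; (it); (it) = next_task_group(it))

    int main(void)
    {
        struct task_group extra = { .next = NULL }, *it;
        int n = 0;

        root_tg.next = &extra;  /* list_add_tail_rcu() analogue */
        for_each_group(it)
            n++;
        printf("%d\n", n);      /* 1 while disabled; 2 once enabled */
        return 0;
    }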
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-6-mkoutny@suse.com --- kernel/sched/core.c | 2 +- kernel/sched/rt.c | 9 ++++++--- kernel/sched/sched.h | 7 +++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58d093a8c1af..32fb4c1100cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9020,7 +9020,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); - list_add_rcu(&tg->list, &task_groups); + list_add_tail_rcu(&tg->list, &task_groups); /* Root should already exist: */ WARN_ON(!parent); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1af3996ec0fb..efa22bad31e1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -495,6 +495,9 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) + return NULL; + do { tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); @@ -507,9 +510,9 @@ static inline struct task_group *next_task_group(struct task_group *tg) } #define for_each_rt_rq(rt_rq, iter, rq) \ - for (iter = container_of(&task_groups, typeof(*iter), list); \ - (iter = next_task_group(iter)) && \ - (rt_rq = iter->rt_rq[cpu_of(rq)]);) + for (iter = &root_task_group; \ + iter && (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = next_task_group(iter)) #define for_each_sched_rt_entity(rt_se) \ for (; rt_se; rt_se = rt_se->parent) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d1e591f91cf8..898aab7417bd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2165,6 +2165,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_RT_GROUP_SCHED + /* + * p->rt.rt_rq is NULL initially and it is easier to assign + * root_task_group's rt_rq than switching in rt_rq_of_se() + * Clobbers tg(!) + */ + if (!rt_group_sched_enabled()) + tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; #endif From 277e0909754e9f3c82def97150d2f3ea700098f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:38 +0100 Subject: [PATCH 11/26] sched: Bypass bandwidth checks with runtime-disabled RT_GROUP_SCHED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When RT_GROUPs are compiled but not exposed, their bandwidth cannot be configured (and it is not initialized for non-root task_groups either). Therefore bypass any checks of task vs task_group bandwidth. This will achieve behavior very similar to setups that have !CONFIG_RT_GROUP_SCHED and attach the cpu controller to the cgroup v2 hierarchy. (On a related note, this may allow having RT tasks with CONFIG_RT_GROUP_SCHED and cgroup v2 hierarchy.)
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-7-mkoutny@suse.com --- kernel/sched/core.c | 6 +++++- kernel/sched/rt.c | 2 +- kernel/sched/syscalls.c | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 32fb4c1100cb..6900ce5b9039 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9206,11 +9206,15 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *task; struct cgroup_subsys_state *css; + if (!rt_group_sched_enabled()) + goto scx_check; + cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; } -#endif +scx_check: +#endif /* CONFIG_RT_GROUP_SCHED */ return scx_cgroup_can_attach(tset); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index efa22bad31e1..5e82bfe56fdf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2864,7 +2864,7 @@ static int sched_rt_global_constraints(void) int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ - if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + if (rt_group_sched_enabled() && rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) return 0; return 1; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 2bf528116fad..547c1f05b667 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -634,7 +634,8 @@ int __sched_setscheduler(struct task_struct *p, * Do not allow real-time tasks into groups that have no runtime * assigned. */ - if (rt_bandwidth_enabled() && rt_policy(policy) && + if (rt_group_sched_enabled() && + rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { retval = -EPERM; From d6809c2f606c14f9e95be87d75a576901d2fa050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:39 +0100 Subject: [PATCH 12/26] sched: Do not construct nor expose RT_GROUP_SCHED structures if disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to kernel cmdline being available early, before any cgroup hierarchy exists, we can achieve the RT_GROUP_SCHED boottime disabling goal by simply skipping any creation (and destruction) of RT_GROUP data and its exposure via RT attributes. We can do this thanks to previously placed runtime guards that would redirect all operations to root_task_group's data when RT_GROUP_SCHED disabled. 
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-8-mkoutny@suse.com --- kernel/sched/core.c | 36 ++++++++++++++++++++++++------------ kernel/sched/rt.c | 9 +++++++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6900ce5b9039..79692f85643f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9867,18 +9867,6 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_cfs_local_stat_show, }, #endif -#ifdef CONFIG_RT_GROUP_SCHED - { - .name = "rt_runtime_us", - .read_s64 = cpu_rt_runtime_read, - .write_s64 = cpu_rt_runtime_write, - }, - { - .name = "rt_period_us", - .read_u64 = cpu_rt_period_read_uint, - .write_u64 = cpu_rt_period_write_uint, - }, -#endif #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -9897,6 +9885,20 @@ static struct cftype cpu_legacy_files[] = { }; #ifdef CONFIG_RT_GROUP_SCHED +static struct cftype rt_group_files[] = { + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, + { } /* Terminate */ +}; + # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED DEFINE_STATIC_KEY_FALSE(rt_group_sched); # else @@ -9919,6 +9921,16 @@ static int __init setup_rt_group_sched(char *str) return 1; } __setup("rt_group_sched=", setup_rt_group_sched); + +static int __init cpu_rt_group_init(void) +{ + if (!rt_group_sched_enabled()) + return 0; + + WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files)); + return 0; +} +subsys_initcall(cpu_rt_group_init); #endif /* CONFIG_RT_GROUP_SCHED */ static int cpu_extra_stat_show(struct seq_file *sf, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5e82bfe56fdf..b6119341f0e2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -193,6 +193,9 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) void unregister_rt_sched_group(struct task_group *tg) { + if (!rt_group_sched_enabled()) + return; + if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); } @@ -201,6 +204,9 @@ void free_rt_sched_group(struct task_group *tg) { int i; + if (!rt_group_sched_enabled()) + return; + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -245,6 +251,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) struct sched_rt_entity *rt_se; int i; + if (!rt_group_sched_enabled()) + return 1; + tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL); if (!tg->rt_rq) goto err; From 87f1fb77d87a6dac9968a321bb10799ae6d2039c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:40 +0100 Subject: [PATCH 13/26] sched: Add RT_GROUP WARN checks for non-root task_groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With CONFIG_RT_GROUP_SCHED but runtime disabling of RT_GROUPs we expect the existence of the root task_group only and all rt_sched_entity'ies should be queued on root's rt_rq. If we get a non-root RT_GROUP something went wrong. 
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-9-mkoutny@suse.com --- kernel/sched/rt.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b6119341f0e2..778911bebacb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -176,11 +176,14 @@ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { + /* Cannot fold with non-CONFIG_RT_GROUP_SCHED version, layout */ + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) { + WARN_ON(!rt_group_sched_enabled() && rt_se->rt_rq->tg != &root_task_group); return rt_se->rt_rq; } @@ -188,6 +191,7 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_se->rt_rq; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); return rt_rq->rq; } @@ -504,8 +508,10 @@ typedef struct task_group *rt_rq_iter_t; static inline struct task_group *next_task_group(struct task_group *tg) { - if (!rt_group_sched_enabled()) + if (!rt_group_sched_enabled()) { + WARN_ON(tg != &root_task_group); return NULL; + } do { tg = list_entry_rcu(tg->list.next, @@ -2607,8 +2613,9 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) { struct rt_rq *rt_rq; -#ifdef CONFIG_RT_GROUP_SCHED +#ifdef CONFIG_RT_GROUP_SCHED // XXX maybe add task_rt_rq(), see also sched_rt_period_rt_rq rt_rq = task_group(p)->rt_rq[cpu]; + WARN_ON(!rt_group_sched_enabled() && rt_rq->tg != &root_task_group); #else rt_rq = &cpu_rq(cpu)->rt; #endif @@ -2718,6 +2725,9 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; + if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) + return -EBUSY; + total = to_ratio(period, runtime); /* From 0ab94c3242742bfb540abeedb6bb98440146ac5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 10 Mar 2025 18:04:41 +0100 Subject: [PATCH 14/26] sched: Add annotations to RT_GROUP_SCHED fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update comments to ease RT throttling understanding. Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250310170442.504716-10-mkoutny@suse.com --- kernel/sched/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 898aab7417bd..c5a6a503eb6d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -813,17 +813,17 @@ struct rt_rq { #ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; - u64 rt_time; - u64 rt_runtime; + u64 rt_time; /* consumed RT time, goes up in update_curr_rt */ + u64 rt_runtime; /* allotted RT time, "slice" from rt_bandwidth, RT sharing/balancing */ /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; unsigned int rt_nr_boosted; - struct rq *rq; + struct rq *rq; /* this is always top-level rq, cache? 
*/ #endif #ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; + struct task_group *tg; /* this tg has "this" rt_rq on given CPU for runnable entities */ #endif }; From 690e47d1403e90b7f2366f03b52ed3304194c793 Mon Sep 17 00:00:00 2001 From: Harshit Agarwal Date: Tue, 25 Feb 2025 18:05:53 +0000 Subject: [PATCH 15/26] sched/rt: Fix race in push_rt_task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Overview ======== When a CPU chooses to call push_rt_task and picks a task to push to another CPU's runqueue then it will call find_lock_lowest_rq method which would take a double lock on both CPUs' runqueues. If one of the locks aren't readily available, it may lead to dropping the current runqueue lock and reacquiring both the locks at once. During this window it is possible that the task is already migrated and is running on some other CPU. These cases are already handled. However, if the task is migrated and has already been executed and another CPU is now trying to wake it up (ttwu) such that it is queued again on the runqeue (on_rq is 1) and also if the task was run by the same CPU, then the current checks will pass even though the task was migrated out and is no longer in the pushable tasks list. Crashes ======= This bug resulted in quite a few flavors of crashes triggering kernel panics with various crash signatures such as assert failures, page faults, null pointer dereferences, and queue corruption errors all coming from scheduler itself. Some of the crashes: -> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO) Call Trace: ? __die_body+0x1a/0x60 ? die+0x2a/0x50 ? do_trap+0x85/0x100 ? pick_next_task_rt+0x6e/0x1d0 ? do_error_trap+0x64/0xa0 ? pick_next_task_rt+0x6e/0x1d0 ? exc_invalid_op+0x4c/0x60 ? pick_next_task_rt+0x6e/0x1d0 ? asm_exc_invalid_op+0x12/0x20 ? pick_next_task_rt+0x6e/0x1d0 __schedule+0x5cb/0x790 ? update_ts_time_stats+0x55/0x70 schedule_idle+0x1e/0x40 do_idle+0x15e/0x200 cpu_startup_entry+0x19/0x20 start_secondary+0x117/0x160 secondary_startup_64_no_verify+0xb0/0xbb -> BUG: kernel NULL pointer dereference, address: 00000000000000c0 Call Trace: ? __die_body+0x1a/0x60 ? no_context+0x183/0x350 ? __warn+0x8a/0xe0 ? exc_page_fault+0x3d6/0x520 ? asm_exc_page_fault+0x1e/0x30 ? pick_next_task_rt+0xb5/0x1d0 ? pick_next_task_rt+0x8c/0x1d0 __schedule+0x583/0x7e0 ? update_ts_time_stats+0x55/0x70 schedule_idle+0x1e/0x40 do_idle+0x15e/0x200 cpu_startup_entry+0x19/0x20 start_secondary+0x117/0x160 secondary_startup_64_no_verify+0xb0/0xbb -> BUG: unable to handle page fault for address: ffff9464daea5900 kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p)) -> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running) Call Trace: ? __die_body+0x1a/0x60 ? die+0x2a/0x50 ? do_trap+0x85/0x100 ? dequeue_top_rt_rq+0xa2/0xb0 ? do_error_trap+0x64/0xa0 ? dequeue_top_rt_rq+0xa2/0xb0 ? exc_invalid_op+0x4c/0x60 ? dequeue_top_rt_rq+0xa2/0xb0 ? asm_exc_invalid_op+0x12/0x20 ? dequeue_top_rt_rq+0xa2/0xb0 dequeue_rt_entity+0x1f/0x70 dequeue_task_rt+0x2d/0x70 __schedule+0x1a8/0x7e0 ? blk_finish_plug+0x25/0x40 schedule+0x3c/0xb0 futex_wait_queue_me+0xb6/0x120 futex_wait+0xd9/0x240 do_futex+0x344/0xa90 ? get_mm_exe_file+0x30/0x60 ? audit_exe_compare+0x58/0x70 ? audit_filter_rules.constprop.26+0x65e/0x1220 __x64_sys_futex+0x148/0x1f0 do_syscall_64+0x30/0x80 entry_SYSCALL_64_after_hwframe+0x62/0xc7 -> BUG: unable to handle page fault for address: ffff8cf3608bc2c0 Call Trace: ? __die_body+0x1a/0x60 ? no_context+0x183/0x350 ? 
spurious_kernel_fault+0x171/0x1c0 ? exc_page_fault+0x3b6/0x520 ? plist_check_list+0x15/0x40 ? plist_check_list+0x2e/0x40 ? asm_exc_page_fault+0x1e/0x30 ? _cond_resched+0x15/0x30 ? futex_wait_queue_me+0xc8/0x120 ? futex_wait+0xd9/0x240 ? try_to_wake_up+0x1b8/0x490 ? futex_wake+0x78/0x160 ? do_futex+0xcd/0xa90 ? plist_check_list+0x15/0x40 ? plist_check_list+0x2e/0x40 ? plist_del+0x6a/0xd0 ? plist_check_list+0x15/0x40 ? plist_check_list+0x2e/0x40 ? dequeue_pushable_task+0x20/0x70 ? __schedule+0x382/0x7e0 ? asm_sysvec_reschedule_ipi+0xa/0x20 ? schedule+0x3c/0xb0 ? exit_to_user_mode_prepare+0x9e/0x150 ? irqentry_exit_to_user_mode+0x5/0x30 ? asm_sysvec_reschedule_ipi+0x12/0x20 Above are some of the common examples of the crashes that were observed due to this issue. Details ======= Let's look at the following scenario to understand this race. 1) CPU A enters push_rt_task a) CPU A has chosen next_task = task p. b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq). c) CPU A identifies CPU X as a destination CPU (X < Z). d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq). e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has locked CPU X’s rq, and thus, CPU A must wait. 2) At CPU Z a) Previous task has completed execution and thus, CPU Z enters schedule, locks its own rq after CPU A releases it. b) CPU Z dequeues previous task and begins executing task p. c) CPU Z unlocks its rq. d) Task p yields the CPU (ex. by doing IO or waiting to acquire a lock) which triggers the schedule function on CPU Z. e) CPU Z enters schedule again, locks its own rq, and dequeues task p. f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq. 3) At CPU B a) CPU B enters try_to_wake_up with input task p. b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates B.state = WAKING. c) CPU B via select_task_rq determines CPU Y as the target CPU. 4) The race a) CPU A acquires CPU X’s lock and relocks CPU Z. b) CPU A reads task p.cpu = Z and incorrectly concludes task p is still on CPU Z. c) CPU A failed to notice task p had been dequeued from CPU Z while CPU A was waiting for locks in double_lock_balance. If CPU A knew that task p had been dequeued, it would return NULL forcing push_rt_task to give up the task p's migration. d) CPU B updates task p.cpu = Y and calls ttwu_queue. e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task p.on_rq = 1. f) CPU B unlocks CPU Y, triggering memory synchronization. g) CPU A reads task p.on_rq = 1, cementing its assumption that task p has not migrated. h) CPU A decides to migrate p to CPU X. This leads to A dequeuing p from Y's queue and various crashes down the line. Solution ======== The solution here is fairly simple. After obtaining the lock (at 4a), the check is enhanced to make sure that the task is still at the head of the pushable tasks list. If not, then it is anyway not suitable for being pushed out. Testing ======= The fix is tested on a cluster of 3 nodes, where the panics due to this are hit every couple of days. A fix similar to this was deployed on such cluster and was stable for more than 30 days. 
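The shape of the fix, reduced to a user-space sketch with pthread mutexes (names illustrative, not the kernel code): once locks have been dropped and re-taken, previously read state is revalidated against the single authoritative check rather than a collection of per-field tests.

    #include <pthread.h>
    #include <stdbool.h>

    struct task;
    struct runq {
        pthread_mutex_t lock;
        struct task *pushable_head;     /* head of the pushable list */
    };

    /* analogue of pick_next_pushable_task(): may change while unlocked */
    static struct task *first_pushable(struct runq *rq)
    {
        return rq->pushable_head;
    }

    static bool lock_lowest(struct runq *rq, struct runq *lowest, struct task *p)
    {
        /* lock ordering may force dropping rq->lock, opening the race window */
        pthread_mutex_unlock(&rq->lock);
        pthread_mutex_lock(&lowest->lock);
        pthread_mutex_lock(&rq->lock);

        /*
         * Window closed: p may have migrated, run elsewhere, and been
         * woken again in the meantime. One comparison against the head
         * of the pushable list subsumes the old task_rq()/task_on_cpu()/
         * on_rq checks that the race slipped through.
         */
        if (p != first_pushable(rq)) {
            pthread_mutex_unlock(&lowest->lock);
            return false;               /* give up this push */
        }
        return true;
    }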
Co-developed-by: Jon Kohler Signed-off-by: Jon Kohler Co-developed-by: Gauri Patwardhan Signed-off-by: Gauri Patwardhan Co-developed-by: Rahul Chunduru Signed-off-by: Rahul Chunduru Signed-off-by: Harshit Agarwal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: "Steven Rostedt (Google)" Reviewed-by: Phil Auld Tested-by: Will Ton Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com --- kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 778911bebacb..e40422c37033 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1894,6 +1894,27 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(task_current_donor(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1924,18 +1945,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* * We had to unlock the run queue. In * the mean time, task could have - * migrated already or had its affinity changed. - * Also make sure that it wasn't scheduled on its rq. + * migrated already or had its affinity changed, + * therefore check if the task is still at the + * head of the pushable tasks list. * It is possible the task was scheduled, set * "migrate_disabled" and then got preempted, so we must * check the task migration disable flag here too. */ - if (unlikely(task_rq(task) != rq || + if (unlikely(is_migration_disabled(task) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_on_cpu(rq, task) || - !rt_task(task) || - is_migration_disabled(task) || - !task_on_rq_queued(task))) { + task != pick_next_pushable_task(rq))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1955,27 +1974,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(task_current_donor(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task From 6432e163ba1b7d80b5876792ce53e511f041ab91 Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 18 Feb 2025 18:46:18 +0000 Subject: [PATCH 16/26] sched/isolation: Make use of more than one housekeeping cpu The existing code uses housekeeping_any_cpu() to select a cpu for a given housekeeping task. However, this often ends up calling cpumask_any_and(), which is defined as cpumask_first_and() and has the effect of always using the first cpu among those available. The same applies when multiple NUMA nodes are involved.
In that case the first cpu in the local node is chosen, which does provide a bit of spreading, but with multiple HK cpus per node the same issues arise. We have numerous cases where a single HK cpu just cannot keep up and the remote_tick warning fires. It can also lead to other things (orchestration software, HA keepalives, etc.) on the HK cpus getting starved, which leads to other issues. In these cases we recommend increasing the number of HK cpus. But... that only helps the userspace tasks somewhat. It does not help the actual housekeeping part. Spread the HK work out by having housekeeping_any_cpu() and sched_numa_find_closest() use cpumask_any_and_distribute() instead of cpumask_any_and(). Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Reviewed-by: Vishal Chourasia Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20250218184618.1331715-1-pauld@redhat.com --- kernel/sched/isolation.c | 2 +- kernel/sched/topology.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81bc8b329ef1..93b038d48900 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -40,7 +40,7 @@ int housekeeping_any_cpu(enum hk_type type) if (cpu < nr_cpu_ids) return cpu; - cpu = cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask); + cpu = cpumask_any_and_distribute(housekeeping.cpumasks[type], cpu_online_mask); if (likely(cpu < nr_cpu_ids)) return cpu; /* diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b334f254f5e3..bbc2fc2c7c22 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2098,7 +2098,7 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu) for (i = 0; i < sched_domains_numa_levels; i++) { if (!masks[i][j]) break; - cpu = cpumask_any_and(cpus, masks[i][j]); + cpu = cpumask_any_and_distribute(cpus, masks[i][j]); if (cpu < nr_cpu_ids) { found = cpu; break; From 872aa4de18889be63317a8c0f2de71a3a01e487c Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 9 Apr 2025 05:34:43 +0000 Subject: [PATCH 17/26] sched/fair: Use READ_ONCE() to read sg->asym_prefer_cpu Subsequent commits add support for dynamically updating the sched_group struct's "asym_prefer_cpu" member from a remote CPU. Use READ_ONCE() when reading "sg->asym_prefer_cpu" to ensure the load balancer always reads the latest value.
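For readers less familiar with the kernel's accessors, a rough C11 analogue of the READ_ONCE()/WRITE_ONCE() pairing (a sketch, not the kernel macros) is a relaxed atomic load and store, which stops the compiler from tearing, fusing, or caching the access while a remote CPU updates the field:

    #include <stdatomic.h>

    struct sg { _Atomic int asym_prefer_cpu; };

    /* remote CPU updating the preferred CPU: kernel's WRITE_ONCE() */
    static void sg_set_prefer(struct sg *g, int cpu)
    {
        atomic_store_explicit(&g->asym_prefer_cpu, cpu, memory_order_relaxed);
    }

    /* load balancer reading it: kernel's READ_ONCE() */
    static int sg_get_prefer(struct sg *g)
    {
        return atomic_load_explicit(&g->asym_prefer_cpu, memory_order_relaxed);
    }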
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250409053446.23367-2-kprateek.nayak@amd.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c19459c8042..5e1bd9e8464c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10251,7 +10251,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group (sgs->group_weight - sgs->idle_cpus != 1)) return false; - return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); + return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); } /* One group has more than one SMT CPU while the other group does not */ @@ -10488,7 +10488,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_asym_packing: /* Prefer to move from lowest priority CPU's work */ - return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); + return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), + READ_ONCE(sg->asym_prefer_cpu)); case group_misfit_task: /* From 0e3f6c3696424fa90d6f512779d617a05a1cf031 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 9 Apr 2025 05:34:44 +0000 Subject: [PATCH 18/26] sched/topology: Introduce sched_update_asym_prefer_cpu() A subset of AMD Processors supporting Preferred Core Rankings also feature the ability to dynamically switch these rankings at runtime to bias load balancing towards or away from the LLC domain with larger cache. To support dynamically updating "sg->asym_prefer_cpu" without needing to rebuild the sched domain, introduce sched_update_asym_prefer_cpu() which recomputes the "asym_prefer_cpu" when the core ranking of a CPU changes. sched_update_asym_prefer_cpu() swaps the "sg->asym_prefer_cpu" with the CPU whose ranking has changed if the new ranking is greater than that of the "asym_prefer_cpu". If the CPU whose ranking has changed is the current "asym_prefer_cpu", it scans the CPUs of the sched group to find the new "asym_prefer_cpu" and sets it accordingly. get_group() for non-overlapping sched domains returns the sched group for the first CPU in the sched_group_span() which ensures all CPUs in the group see the updated value of "asym_prefer_cpu". Overlapping groups are allocated differently and will require moving the "asym_prefer_cpu" to "sg->sgc" but since the current implementations do not set "SD_ASYM_PACKING" at NUMA domains, skip the additional indirection and place a SCHED_WARN_ON() to alert any future users.
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250409053446.23367-3-kprateek.nayak@amd.com --- include/linux/sched/topology.h | 6 ++++ kernel/sched/topology.c | 58 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7b4301b7235f..198bb5cc1774 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -195,6 +195,8 @@ struct sched_domain_topology_level { }; extern void __init set_sched_topology(struct sched_domain_topology_level *tl); +extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); + # define SD_INIT_NAME(type) .name = #type @@ -223,6 +225,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) return true; } +static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bbc2fc2c7c22..a2a38e1b6f18 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1333,6 +1333,64 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) update_group_capacity(sd, cpu); } +#ifdef CONFIG_SMP + +/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ +void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) +{ + int asym_prefer_cpu = cpu; + struct sched_domain *sd; + + guard(rcu)(); + + for_each_domain(cpu, sd) { + struct sched_group *sg; + int group_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + continue; + + /* + * Groups of overlapping domain are replicated per NUMA + * node and will require updating "asym_prefer_cpu" on + * each local copy. + * + * If you are hitting this warning, consider moving + * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" + * which is shared by all the overlapping groups. + */ + WARN_ON_ONCE(sd->flags & SD_OVERLAP); + + sg = sd->groups; + if (cpu != sg->asym_prefer_cpu) { + /* + * Since the parent is a superset of the current group, + * if the cpu is not the "asym_prefer_cpu" at the + * current level, it cannot be the preferred CPU at a + * higher levels either. + */ + if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) + return; + + WRITE_ONCE(sg->asym_prefer_cpu, cpu); + continue; + } + + /* Ranking has improved; CPU is still the preferred one. */ + if (new_prio >= old_prio) + continue; + + for_each_cpu(group_cpu, sched_group_span(sg)) { + if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) + asym_prefer_cpu = group_cpu; + } + + WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); + } +} + +#endif /* CONFIG_SMP */ + /* * Set of available CPUs grouped by their corresponding capacities * Each list entry contains a CPU mask reflecting CPUs that share the same From 8157fbc907452aa5674df2de23c1c7305c907006 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 9 Apr 2025 05:34:45 +0000 Subject: [PATCH 19/26] cpufreq/amd-pstate: Update asym_prefer_cpu when core rankings change A subset of AMD systems supporting Preferred Core rankings can have their rankings changed dynamically at runtime. Update the "sg->asym_prefer_cpu" across the local hierarchy of CPU when the preferred core ranking changes. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mario Limonciello Link: https://lore.kernel.org/r/20250409053446.23367-4-kprateek.nayak@amd.com --- drivers/cpufreq/amd-pstate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 6789eed1bb5b..8796217ccc60 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -844,8 +844,10 @@ static void amd_pstate_update_limits(unsigned int cpu) if (highest_perf_changed) { WRITE_ONCE(cpudata->prefcore_ranking, cur_high); - if (cur_high < CPPC_MAX_PERF) + if (cur_high < CPPC_MAX_PERF) { sched_set_itmt_core_prio((int)cur_high, cpu); + sched_update_asym_prefer_cpu(cpu, prev_high, cur_high); + } } }
From 44671e21e3463f36f6c6e4b691216f60e85840e4 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 9 Apr 2025 05:34:46 +0000 Subject: [PATCH 20/26] sched/debug: Print the local group's asym_prefer_cpu Add a file to read the local group's "asym_prefer_cpu" from debugfs. This information was useful when debugging issues where "asym_prefer_cpu" was incorrectly set to a CPU with a lower asym priority. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250409053446.23367-5-kprateek.nayak@amd.com --- kernel/sched/debug.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 56ae54e0ce6a..557246880a7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -588,6 +588,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); + + if (sd->flags & SD_ASYM_PACKING) + debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, + (u32 *)&sd->groups->asym_prefer_cpu); } void update_sched_domain_debugfs(void)
From c70fc32f44431bb30f9025ce753ba8be25acbba3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Jan 2025 15:39:49 +0100 Subject: [PATCH 21/26] sched/fair: Adhere to place_entity() constraints Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag") relies on commit 4423af84b297 ("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not trip a WARN in place_entity(). What happens is that the lag of the very last entity is 0 per definition -- the average of one element matches the value of that element. Therefore place_entity() will match the condition skipping the lag adjustment: if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { Without the 'se->vlag' condition it will attempt to adjust the zero lag even though we're inserting into an empty tree. Notably, we should have failed the 'cfs_rq->nr_queued' condition, but don't, because it didn't get updated. Additionally, move update_load_add() after place_entity(), as is consistent with other place_entity() users -- this change is non-functional, place_entity() does not use cfs_rq->load.
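To see why the last entity's lag is zero, here is a standalone toy computation (userspace C, not kernel code) of the weighted-average definition used above:

#include <stdio.h>

struct toy_se { double vruntime, weight; };

/* V = (Sum w_i * v_i) / (Sum w_i) over the queued entities. */
static double avg_vruntime(const struct toy_se *se, int n)
{
	double wsum = 0.0, vsum = 0.0;
	for (int i = 0; i < n; i++) {
		wsum += se[i].weight;
		vsum += se[i].weight * se[i].vruntime;
	}
	return vsum / wsum;
}

int main(void)
{
	struct toy_se last = { .vruntime = 42.0, .weight = 1024.0 };
	/* With one element the average equals the element, so lag == 0. */
	printf("lag = %f\n", avg_vruntime(&last, 1) - last.vruntime);
	return 0;
}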
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag") Reported-by: Mike Galbraith Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gmx.de --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e1bd9e8464c..eb5a2572b4f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3795,6 +3795,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); @@ -3821,10 +3822,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - update_load_add(&cfs_rq->load, se->load.weight); place_entity(cfs_rq, se, 0); + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; /* * The entity's vruntime has been adjusted, so let's check
From b7ca5743a2604156d6083b88cefacef983f3a3a6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 29 Apr 2025 08:07:26 -0700 Subject: [PATCH 22/26] sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks It was reported that in 6.12, smpboot_create_threads() was taking much longer than in 6.6. I narrowed down the call path to: smpboot_create_threads() -> kthread_create_on_cpu() -> kthread_bind() -> __kthread_bind_mask() -> wait_task_inactive() In wait_task_inactive() we were regularly hitting the queued case, which sets a 1-tick timeout; hit multiple times in a row, this quickly accumulates into a long delay. I noticed that disabling the DELAY_DEQUEUE sched feature recovered the performance, and it seems the newly created tasks are usually sched_delayed and left on the runqueue. So in wait_task_inactive(), when we see p->se.sched_delayed set, manually dequeue the sched_delayed task with DEQUEUE_DELAYED, so we don't have to constantly wait a tick. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Reported-by: peter-yc.chang@mediatek.com Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250429150736.3778580-1-jstultz@google.com --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 79692f85643f..a3507ed58424 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2283,6 +2283,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * just go back and repeat.
*/ rq = task_rq_lock(p, &rf); + /* + * If task is sched_delayed, force dequeue it, to avoid always + * hitting the tick timeout in the queued case + */ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); trace_sched_wait_task(p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); From 676e8cf70cb0533e1118e29898c9a9c33ae3a10f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 May 2025 13:36:59 +0200 Subject: [PATCH 23/26] sched,livepatch: Untangle cond_resched() and live-patching With the goal of deprecating / removing VOLUNTARY preempt, live-patch needs to stop relying on cond_resched() to make forward progress. Instead, rely on schedule() with TASK_FREEZABLE set. Just like live-patching, the freezer needs to be able to stop tasks in a safe / known state. [bigeasy: use likely() in __klp_sched_try_switch() and update comments] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Reviewed-by: Petr Mladek Tested-by: Petr Mladek Tested-by: Miroslav Benes Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20250509113659.wkP_HJ5z@linutronix.de --- include/linux/livepatch_sched.h | 14 ++++----- include/linux/sched.h | 6 ---- kernel/livepatch/transition.c | 51 ++++++++++----------------------- kernel/sched/core.c | 50 ++++++-------------------------- 4 files changed, 28 insertions(+), 93 deletions(-) diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h index 013794fb5da0..065c185f2763 100644 --- a/include/linux/livepatch_sched.h +++ b/include/linux/livepatch_sched.h @@ -3,27 +3,23 @@ #define _LINUX_LIVEPATCH_SCHED_H_ #include -#include +#include #ifdef CONFIG_LIVEPATCH void __klp_sched_try_switch(void); -#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) - DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key); -static __always_inline void klp_sched_try_switch(void) +static __always_inline void klp_sched_try_switch(struct task_struct *curr) { - if (static_branch_unlikely(&klp_sched_try_switch_key)) + if (static_branch_unlikely(&klp_sched_try_switch_key) && + READ_ONCE(curr->__state) & TASK_FREEZABLE) __klp_sched_try_switch(); } -#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - #else /* !CONFIG_LIVEPATCH */ -static inline void klp_sched_try_switch(void) {} -static inline void __klp_sched_try_switch(void) {} +static inline void klp_sched_try_switch(struct task_struct *curr) {} #endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_SCHED_H_ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..b98195991031 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -2089,9 +2088,6 @@ extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -void sched_dynamic_klp_enable(void); -void sched_dynamic_klp_disable(void); - DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) @@ -2112,7 +2108,6 @@ static __always_inline int _cond_resched(void) static inline int _cond_resched(void) { - klp_sched_try_switch(); return __cond_resched(); } @@ -2122,7 +2117,6 @@ static inline int _cond_resched(void) static inline int _cond_resched(void) { - klp_sched_try_switch(); return 0; } diff --git a/kernel/livepatch/transition.c 
b/kernel/livepatch/transition.c index ba069459c101..2351a19ac2a9 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -29,22 +29,13 @@ static unsigned int klp_signals_cnt; /* * When a livepatch is in progress, enable klp stack checking in - * cond_resched(). This helps CPU-bound kthreads get patched. + * schedule(). This helps CPU-bound kthreads get patched. */ -#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) - -#define klp_cond_resched_enable() sched_dynamic_klp_enable() -#define klp_cond_resched_disable() sched_dynamic_klp_disable() - -#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key); -EXPORT_SYMBOL(klp_sched_try_switch_key); -#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key) -#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key) - -#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ +#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key) +#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key) /* * This work can be performed periodically to finish patching or unpatching any @@ -365,27 +356,19 @@ static bool klp_try_switch_task(struct task_struct *task) void __klp_sched_try_switch(void) { + /* + * This function is called from __schedule() while a context switch is + * about to happen. Preemption is already disabled and klp_mutex + * can't be acquired. + * Disabled preemption is used to prevent racing with other callers of + * klp_try_switch_task(). Thanks to task_call_func() they won't be + * able to switch to this task while it's running. + */ + lockdep_assert_preemption_disabled(); + if (likely(!klp_patch_pending(current))) return; - /* - * This function is called from cond_resched() which is called in many - * places throughout the kernel. Using the klp_mutex here might - * deadlock. - * - * Instead, disable preemption to prevent racing with other callers of - * klp_try_switch_task(). Thanks to task_call_func() they won't be - * able to switch this task while it's running. - */ - preempt_disable(); - - /* - * Make sure current didn't get patched between the above check and - * preempt_disable(). - */ - if (unlikely(!klp_patch_pending(current))) - goto out; - /* * Enforce the order of the TIF_PATCH_PENDING read above and the * klp_target_state read in klp_try_switch_task(). The corresponding @@ -395,11 +378,7 @@ void __klp_sched_try_switch(void) smp_rmb(); klp_try_switch_task(current); - -out: - preempt_enable(); } -EXPORT_SYMBOL(__klp_sched_try_switch); /* * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. @@ -508,7 +487,7 @@ void klp_try_complete_transition(void) } /* Done! Now cleanup the data structures. 
*/ - klp_cond_resched_disable(); + klp_resched_disable(); patch = klp_transition_patch; klp_complete_transition(); @@ -560,7 +539,7 @@ void klp_start_transition(void) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } - klp_cond_resched_enable(); + klp_resched_enable(); klp_signals_cnt = 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a3507ed58424..bece0ba6f5b3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -66,6 +66,7 @@ #include #include #include +#include #ifdef CONFIG_PREEMPT_DYNAMIC # ifdef CONFIG_GENERIC_ENTRY @@ -6676,6 +6677,8 @@ static void __sched notrace __schedule(int sched_mode) if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); + klp_sched_try_switch(prev); + local_irq_disable(); rcu_note_context_switch(preempt); @@ -7336,7 +7339,6 @@ EXPORT_STATIC_CALL_TRAMP(might_resched); static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { - klp_sched_try_switch(); if (!static_branch_unlikely(&sk_dynamic_cond_resched)) return 0; return __cond_resched(); @@ -7508,7 +7510,6 @@ int sched_dynamic_mode(const char *str) #endif static DEFINE_MUTEX(sched_dynamic_mutex); -static bool klp_override; static void __sched_dynamic_update(int mode) { @@ -7516,8 +7517,7 @@ static void __sched_dynamic_update(int mode) * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in * the ZERO state, which is invalid. */ - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7526,8 +7526,7 @@ static void __sched_dynamic_update(int mode) switch (mode) { case preempt_dynamic_none: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7538,8 +7537,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_voluntary: - if (!klp_override) - preempt_dynamic_enable(cond_resched); + preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notrace); @@ -7550,8 +7548,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_full: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7562,8 +7559,7 @@ static void __sched_dynamic_update(int mode) break; case preempt_dynamic_lazy: - if (!klp_override) - preempt_dynamic_disable(cond_resched); + preempt_dynamic_disable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); @@ -7584,36 +7580,6 @@ void sched_dynamic_update(int mode) mutex_unlock(&sched_dynamic_mutex); } -#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL - -static int klp_cond_resched(void) -{ - __klp_sched_try_switch(); - return __cond_resched(); -} - -void sched_dynamic_klp_enable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = true; - static_call_update(cond_resched, klp_cond_resched); - - mutex_unlock(&sched_dynamic_mutex); -} - -void sched_dynamic_klp_disable(void) -{ - mutex_lock(&sched_dynamic_mutex); - - klp_override = false; - 
__sched_dynamic_update(preempt_dynamic_mode); - - mutex_unlock(&sched_dynamic_mutex); -} - -#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ - static int __init setup_preempt_mode(char *str) { int mode = sched_dynamic_mode(str);
From aa3ee4f0b7541382c9f6f43f7408d73a5d4f4042 Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Mon, 3 Mar 2025 18:52:39 +0800 Subject: [PATCH 24/26] sched/fair: Fixup wake_up_sync() vs DELAYED_DEQUEUE The delayed dequeue feature keeps a sleeping task enqueued until its lag has elapsed. As a result, the task also stays visible in rq->nr_running. So in wake_affine_idle(), we should use the real number of running tasks on the rq when checking whether to place the woken task on the current CPU. Also add a helper function to return the number of delayed tasks. Fixes: 152e11f6df29 ("sched/fair: Implement delayed dequeue") Signed-off-by: Xuewen Yan Reviewed-and-tested-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20250303105241.17251-2-xuewen.yan@unisoc.com --- kernel/sched/fair.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eb5a2572b4f8..b00f16700f9c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7193,6 +7193,11 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) return true; } +static inline unsigned int cfs_h_nr_delayed(struct rq *rq) +{ + return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); +} + #ifdef CONFIG_SMP /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ @@ -7354,8 +7359,12 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; - if (sync && cpu_rq(this_cpu)->nr_running == 1) - return this_cpu; + if (sync) { + struct rq *rq = cpu_rq(this_cpu); + + if ((rq->nr_running - cfs_h_nr_delayed(rq)) == 1) + return this_cpu; + } if (available_idle_cpu(prev_cpu)) return prev_cpu;
From 0212696a844631a923aa6cedd74ebbb3cf434e51 Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Thu, 17 Apr 2025 12:34:56 +0800 Subject: [PATCH 25/26] sched/util_est: Simplify condition for util_est_{en,de}queue() To prevent double enqueue/dequeue of the util-est for sched_delayed tasks, commit 729288bc6856 ("kernel/sched: Fix util_est accounting for DELAY_DEQUEUE") added the corresponding check. This check excludes double en/dequeue during task migration and priority changes. In fact, these conditions can be simplified. For util_est_dequeue, we know that the sched_delayed flag is set in dequeue_entity(). When the task is sleeping, we need to call util_est_dequeue to subtract util-est from the cfs_rq. At this point, sched_delayed has not yet been set. If we find that sched_delayed is already set, it indicates that this task has already called dequeue_task_fair once. In this case, there is no need to call util_est_dequeue again. Therefore, simply checking the sched_delayed flag should be sufficient to prevent unnecessary util_est updates during the dequeue. For util_est_enqueue, our goal is to add the util_est to the cfs_rq when the task is enqueued. However, we don't want to add the util_est of a sched_delayed task to the cfs_rq because the task is sleeping. Therefore, we can exclude the util_est_enqueue for sched_delayed tasks by checking the sched_delayed flag. However, when waking up a delayed task, the sched_delayed flag is cleared after util_est_enqueue.
As a result, if we only check the sched_delayed flag, we would miss the util_est_enqueue. Since waking up a sched_delayed task calls enqueue_task with the ENQUEUE_DELAYED flag, we can determine whether to call util_est_enqueue by checking whether the enqueue flags contain ENQUEUE_DELAYED. Signed-off-by: Xuewen Yan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20250417043457.10632-2-xuewen.yan@unisoc.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b00f16700f9c..a028c294aef3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6936,7 +6936,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * Let's add the task's estimated utilization to the cfs_rq's * estimated utilization, before we update schedutil. */ - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) + if (!p->se.sched_delayed || (flags & ENQUEUE_DELAYED)) util_est_enqueue(&rq->cfs, p); if (flags & ENQUEUE_DELAYED) { @@ -7178,7 +7178,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { - if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) + if (!p->se.sched_delayed) util_est_dequeue(&rq->cfs, p); util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
From 90ca9410dab21c407706726b86b6e50c6698b5af Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Thu, 17 Apr 2025 12:34:57 +0800 Subject: [PATCH 26/26] sched/uclamp: Align uclamp and util_est and call before freq update The commit dfa0a574cbc47 ("sched/uclamg: Handle delayed dequeue") added the sched_delayed check to prevent double uclamp_dec/inc. However, it put the uclamp_rq_inc() after enqueue_task(). This may lead to the following issue: when a task with uclamp goes through enqueue_task() and triggers a cpufreq update, its uclamp won't even be considered in that cpufreq update. Only after the enqueue is the uclamp added to the rq's buckets, so cpufreq will only pick it up at the next update. This could delay the frequency update, which may affect performance (uclamp_min > 0) or power (uclamp_max < 1024). So, just like util_est, put the uclamp_rq_inc() before enqueue_task(). As for sched_delayed tasks, do the same as util_est: use the sched_delayed flag to prevent incrementing a delayed task's uclamp, and use the ENQUEUE_DELAYED flag to allow the increment for a delayed task that is being woken up.
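The gating logic shared by util_est and uclamp then boils down to the following standalone sketch (the flag value here is illustrative only; the real one lives in kernel/sched/sched.h):

#define ENQUEUE_DELAYED 0x200	/* illustrative value, not the kernel's */

/* Account a task on enqueue unless it is still delay-dequeued;
 * a delayed task being woken up (ENQUEUE_DELAYED) is accounted. */
static inline int should_account_enqueue(int sched_delayed, int flags)
{
	return !sched_delayed || (flags & ENQUEUE_DELAYED);
}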
Signed-off-by: Xuewen Yan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20250417043457.10632-3-xuewen.yan@unisoc.com --- kernel/sched/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bece0ba6f5b3..64c875749ac2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1753,7 +1753,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, } } -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { enum uclamp_id clamp_id; @@ -1769,7 +1769,8 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) if (unlikely(!p->sched_class->uclamp_enabled)) return; - if (p->se.sched_delayed) + /* Only inc the delayed task which is being woken up. */ + if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED)) return; for_each_clamp_id(clamp_id) @@ -2037,7 +2038,7 @@ static void __init init_uclamp(void) } #else /* !CONFIG_UCLAMP_TASK */ -static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } @@ -2073,12 +2074,14 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_NOCLOCK)) update_rq_clock(rq); - p->sched_class->enqueue_task(rq, p, flags); /* - * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear - * ->sched_delayed. + * Can be before ->enqueue_task() because uclamp considers the + * ENQUEUE_DELAYED task before its ->sched_delayed gets cleared + * in ->enqueue_task(). */ - uclamp_rq_inc(rq, p); + uclamp_rq_inc(rq, p, flags); + + p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags);