sched_ext: Changes for v6.19

- Improve recovery from misbehaving BPF schedulers. When a scheduler puts many
   tasks with varying affinity restrictions on a shared DSQ, CPUs scanning
   through tasks they cannot run can overwhelm the system, causing lockups.
   Bypass mode now uses per-CPU DSQs with a load balancer to avoid this, and
   hooks into the hardlockup detector to attempt recovery. Add scx_cpu0 example
   scheduler to demonstrate this scenario.
 
 - Add lockless peek operation for DSQs to reduce lock contention for schedulers
   that need to query queue state during load balancing.
 
 - Allow scx_bpf_reenqueue_local() to be called from anywhere in preparation for
   deprecating cpu_acquire/release() callbacks in favor of generic BPF hooks.
 
 - Prepare for hierarchical scheduler support: add scx_bpf_task_set_slice() and
   scx_bpf_task_set_dsq_vtime() kfuncs, make scx_bpf_dsq_insert*() return bool,
   and wrap kfunc args in structs for future aux__prog parameter.
 
 - Implement cgroup_set_idle() callback to notify BPF schedulers when a cgroup's
   idle state changes.
 
- Fix migration tasks being incorrectly downgraded from stop_sched_class to
   rt_sched_class across sched_ext enable/disable. Applied late because the fix
   is low-risk and the bug, though subtle, needs stable backporting.
 
 - Various fixes and cleanups including cgroup exit ordering, SCX_KICK_WAIT
   reliability, and backward compatibility improvements.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCaS4h1A4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGe/MAP9EZ0pLiTpmMtt6mI/11Fmi+aWfL84j1zt13cz9
 W4vb4gEA9eVEH6n9xyC4nhcOk9AQwSDuCWMOzLsnhW8TbEHVTww=
 =8W/B
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext updates from Tejun Heo:

 - Improve recovery from misbehaving BPF schedulers.

   When a scheduler puts many tasks with varying affinity restrictions
   on a shared DSQ, CPUs scanning through tasks they cannot run can
   overwhelm the system, causing lockups.

   Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
   and hooks into the hardlockup detector to attempt recovery.

   Add scx_cpu0 example scheduler to demonstrate this scenario.

 - Add lockless peek operation for DSQs to reduce lock contention for
   schedulers that need to query queue state during load balancing.

 - Allow scx_bpf_reenqueue_local() to be called from anywhere in
   preparation for deprecating cpu_acquire/release() callbacks in favor
   of generic BPF hooks.

 - Prepare for hierarchical scheduler support: add
   scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
   make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
   structs for future aux__prog parameter.

 - Implement cgroup_set_idle() callback to notify BPF schedulers when a
   cgroup's idle state changes.

 - Fix migration tasks being incorrectly downgraded from
   stop_sched_class to rt_sched_class across sched_ext enable/disable.
   Applied late because the fix is low-risk and the bug, though subtle,
   needs stable backporting.

 - Various fixes and cleanups including cgroup exit ordering,
   SCX_KICK_WAIT reliability, and backward compatibility improvements.

* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
  sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
  sched_ext: tools: Removing duplicate targets during non-cross compilation
  sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
  sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
  sched_ext: Update comments replacing breather with aborting mechanism
  sched_ext: Implement load balancer for bypass mode
  sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
  sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
  sched_ext: Add scx_cpu0 example scheduler
  sched_ext: Hook up hardlockup detector
  sched_ext: Make handle_lockup() propagate scx_verror() result
  sched_ext: Refactor lockup handlers into handle_lockup()
  sched_ext: Make scx_exit() and scx_vexit() return bool
  sched_ext: Exit dispatch and move operations immediately when aborting
  sched_ext: Simplify breather mechanism with scx_aborting flag
  sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
  sched_ext: Refactor do_enqueue_task() local and global DSQ paths
  sched_ext: Use shorter slice in bypass mode
  sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
  sched_ext: Minor cleanups to scx_task_iter
  ...
This commit is contained in:
Linus Torvalds 2025-12-03 13:25:39 -08:00
commit 02baaa67d9
20 changed files with 1905 additions and 423 deletions

View File

@ -17,7 +17,18 @@
enum scx_public_consts { enum scx_public_consts {
SCX_OPS_NAME_LEN = 128, SCX_OPS_NAME_LEN = 128,
/*
* %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
* to set the slice for a task that is selected for execution.
* %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
* refill has been triggered.
*
* %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
* mode. As making forward progress for all tasks is the main goal of
* the bypass mode, a shorter slice is used.
*/
SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */
SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
}; };
@ -46,6 +57,7 @@ enum scx_dsq_id_flags {
SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3,
SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
}; };
@ -58,6 +70,7 @@ enum scx_dsq_id_flags {
*/ */
struct scx_dispatch_q { struct scx_dispatch_q {
raw_spinlock_t lock; raw_spinlock_t lock;
struct task_struct __rcu *first_task; /* lockless peek at head */
struct list_head list; /* tasks in dispatch order */ struct list_head list; /* tasks in dispatch order */
struct rb_root priq; /* used to order by p->scx.dsq_vtime */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */
u32 nr; u32 nr;
@ -136,6 +149,13 @@ struct scx_dsq_list_node {
u32 priv; /* can be used by iter cursor */ u32 priv; /* can be used by iter cursor */
}; };
#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \
(struct scx_dsq_list_node) { \
.node = LIST_HEAD_INIT((__node).node), \
.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \
.priv = (__priv), \
}
/* /*
* The following is embedded in task_struct and contains all fields necessary * The following is embedded in task_struct and contains all fields necessary
* for a task to be scheduled by SCX. * for a task to be scheduled by SCX.
@ -207,16 +227,18 @@ struct sched_ext_entity {
struct list_head tasks_node; struct list_head tasks_node;
}; };
void sched_ext_free(struct task_struct *p); void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s); void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void); bool scx_rcu_cpu_stall(void);
#else /* !CONFIG_SCHED_CLASS_EXT */ #else /* !CONFIG_SCHED_CLASS_EXT */
static inline void sched_ext_free(struct task_struct *p) {} static inline void sched_ext_dead(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {} static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_hardlockup(int cpu) { return false; }
static inline bool scx_rcu_cpu_stall(void) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; }
#endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* CONFIG_SCHED_CLASS_EXT */
@ -228,6 +250,7 @@ struct scx_task_group {
u64 bw_period_us; u64 bw_period_us;
u64 bw_quota_us; u64 bw_quota_us;
u64 bw_burst_us; u64 bw_burst_us;
bool idle;
#endif #endif
}; };

View File

@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event,
) )
); );
TRACE_EVENT(sched_ext_bypass_lb,
TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced,
__u32 before_min, __u32 before_max,
__u32 after_min, __u32 after_max),
TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced,
before_min, before_max, after_min, after_max),
TP_STRUCT__entry(
__field( __u32, node )
__field( __u32, nr_cpus )
__field( __u32, nr_tasks )
__field( __u32, nr_balanced )
__field( __u32, before_min )
__field( __u32, before_max )
__field( __u32, after_min )
__field( __u32, after_max )
),
TP_fast_assign(
__entry->node = node;
__entry->nr_cpus = nr_cpus;
__entry->nr_tasks = nr_tasks;
__entry->nr_balanced = nr_balanced;
__entry->before_min = before_min;
__entry->before_max = before_max;
__entry->after_min = after_min;
__entry->after_max = after_max;
),
TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u",
__entry->node, __entry->nr_cpus,
__entry->nr_tasks, __entry->nr_balanced,
__entry->before_min, __entry->after_min,
__entry->before_max, __entry->after_max
)
);
#endif /* _TRACE_SCHED_EXT_H */ #endif /* _TRACE_SCHED_EXT_H */
/* This part must be outside protection */ /* This part must be outside protection */

View File

@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current); WARN_ON(tsk == current);
unwind_task_free(tsk); unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk); io_uring_free(tsk);
cgroup_task_free(tsk); cgroup_task_free(tsk);
task_numa_free(tsk, true); task_numa_free(tsk, true);

View File

@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead) if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev); prev->sched_class->task_dead(prev);
/*
* sched_ext_dead() must come before cgroup_task_dead() to
* prevent cgroups from being removed while its member tasks are
* visible to SCX schedulers.
*/
sched_ext_dead(prev);
cgroup_task_dead(prev); cgroup_task_dead(prev);
/* Task is done with its stack. */ /* Task is done with its stack. */

File diff suppressed because it is too large Load Diff

View File

@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
return prev_cpu; return prev_cpu;
} }
struct scx_bpf_select_cpu_and_args {
/* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
s32 prev_cpu;
u64 wake_flags;
u64 flags;
};
/** /**
* scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
* prioritizing those in @cpus_allowed
* @p: task_struct to select a CPU for * @p: task_struct to select a CPU for
* @prev_cpu: CPU @p was on previously
* @wake_flags: %SCX_WAKE_* flags
* @cpus_allowed: cpumask of allowed CPUs * @cpus_allowed: cpumask of allowed CPUs
* @flags: %SCX_PICK_IDLE* flags * @args: struct containing the rest of the arguments
* @args->prev_cpu: CPU @p was on previously
* @args->wake_flags: %SCX_WAKE_* flags
* @args->flags: %SCX_PICK_IDLE* flags
*
* Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
* limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
* as an inline wrapper in common.bpf.h.
* *
* Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
* context such as a BPF test_run() call, as long as built-in CPU selection * context such as a BPF test_run() call, as long as built-in CPU selection
* is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
* is set. * is set.
* *
* @p, @prev_cpu and @wake_flags match ops.select_cpu(). * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
* *
* Returns the selected idle CPU, which will be automatically awakened upon * Returns the selected idle CPU, which will be automatically awakened upon
* returning from ops.select_cpu() and can be used for direct dispatch, or * returning from ops.select_cpu() and can be used for direct dispatch, or
* a negative value if no idle CPU is available. * a negative value if no idle CPU is available.
*/ */
__bpf_kfunc s32
__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
struct scx_bpf_select_cpu_and_args *args)
{
struct scx_sched *sch;
guard(rcu)();
sch = rcu_dereference(scx_root);
if (unlikely(!sch))
return -ENODEV;
return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
cpus_allowed, args->flags);
}
/*
* COMPAT: Will be removed in v6.22.
*/
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags) const struct cpumask *cpus_allowed, u64 flags)
{ {
@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle) BTF_KFUNCS_END(scx_kfunc_ids_idle)

View File

@ -23,6 +23,11 @@ enum scx_consts {
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/ */
SCX_TASK_ITER_BATCH = 32, SCX_TASK_ITER_BATCH = 32,
SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
SCX_BYPASS_LB_DONOR_PCT = 125,
SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
SCX_BYPASS_LB_BATCH = 256,
}; };
enum scx_exit_kind { enum scx_exit_kind {
@ -697,12 +702,23 @@ struct sched_ext_ops {
* 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
* interpreted in the same fashion and specifies how much @cgrp can * interpreted in the same fashion and specifies how much @cgrp can
* burst temporarily. The specific control mechanism and thus the * burst temporarily. The specific control mechanism and thus the
* interpretation of @period_us and burstiness is upto to the BPF * interpretation of @period_us and burstiness is up to the BPF
* scheduler. * scheduler.
*/ */
void (*cgroup_set_bandwidth)(struct cgroup *cgrp, void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
u64 period_us, u64 quota_us, u64 burst_us); u64 period_us, u64 quota_us, u64 burst_us);
/**
* @cgroup_set_idle: A cgroup's idle state is being changed
* @cgrp: cgroup whose idle state is being updated
* @idle: whether the cgroup is entering or exiting idle state
*
* Update @cgrp's idle state to @idle. This callback is invoked when
* a cgroup transitions between idle and non-idle states, allowing the
* BPF scheduler to adjust its behavior accordingly.
*/
void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
#endif /* CONFIG_EXT_GROUP_SCHED */ #endif /* CONFIG_EXT_GROUP_SCHED */
/* /*
@ -884,6 +900,10 @@ struct scx_sched {
struct scx_dispatch_q **global_dsqs; struct scx_dispatch_q **global_dsqs;
struct scx_sched_pcpu __percpu *pcpu; struct scx_sched_pcpu __percpu *pcpu;
/*
* Updates to the following warned bitfields can race causing RMW issues
* but it doesn't really matter.
*/
bool warned_zero_slice:1; bool warned_zero_slice:1;
bool warned_deprecated_rq:1; bool warned_deprecated_rq:1;
@ -948,6 +968,7 @@ enum scx_enq_flags {
SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
SCX_ENQ_DSQ_PRIQ = 1LLU << 57, SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
SCX_ENQ_NESTED = 1LLU << 58,
}; };
enum scx_deq_flags { enum scx_deq_flags {
@ -986,8 +1007,10 @@ enum scx_kick_flags {
SCX_KICK_PREEMPT = 1LLU << 1, SCX_KICK_PREEMPT = 1LLU << 1,
/* /*
* Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will * The scx_bpf_kick_cpu() call will return after the current SCX task of
* return after the target CPU finishes picking the next task. * the target CPU switches out. This can be used to implement e.g. core
* scheduling. This has no effect if the current task on the target CPU
* is not on SCX.
*/ */
SCX_KICK_WAIT = 1LLU << 2, SCX_KICK_WAIT = 1LLU << 2,
}; };

View File

@ -803,10 +803,12 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait; cpumask_var_t cpus_to_wait;
unsigned long pnt_seq; unsigned long kick_sync;
local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb; struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work; struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work; struct irq_work kick_cpus_irq_work;
struct scx_dispatch_q bypass_dsq;
}; };
#endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* CONFIG_SCHED_CLASS_EXT */

View File

@ -196,6 +196,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
#ifdef CONFIG_SYSFS #ifdef CONFIG_SYSFS
++hardlockup_count; ++hardlockup_count;
#endif #endif
/*
* A poorly behaving BPF scheduler can trigger hard lockup by
* e.g. putting numerous affinitized tasks in a single queue and
* directing all CPUs at it. The following call can return true
* only once when sched_ext is enabled and will immediately
* abort the BPF scheduler and print out a warning message.
*/
if (scx_hardlockup(cpu))
return;
/* Only print hardlockups once. */ /* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu)) if (per_cpu(watchdog_hardlockup_warned, cpu))

View File

@ -133,6 +133,7 @@ $(MAKE_DIRS):
$(call msg,MKDIR,,$@) $(call msg,MKDIR,,$@)
$(Q)mkdir -p $@ $(Q)mkdir -p $@
ifneq ($(CROSS_COMPILE),)
$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \ $(APIDIR)/linux/bpf.h \
| $(OBJ_DIR)/libbpf | $(OBJ_DIR)/libbpf
@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
EXTRA_CFLAGS='-g -O0 -fPIC' \ EXTRA_CFLAGS='-g -O0 -fPIC' \
LDFLAGS="$(LDFLAGS)" \ LDFLAGS="$(LDFLAGS)" \
DESTDIR=$(OUTPUT_DIR) prefix= all install_headers DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
endif
$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \ $(APIDIR)/linux/bpf.h \
@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
$(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \ $(BINDIR)/%: \

View File

@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void)
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
u32 scx_bpf_dispatch_nr_slots(void) __ksym; u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym;
bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct rq *scx_bpf_locked_rq(void) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;

View File

@ -16,119 +16,92 @@
}) })
/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
#define __COMPAT_scx_bpf_task_cgroup(p) \ struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
(bpf_ksym_exists(scx_bpf_task_cgroup) ? \
scx_bpf_task_cgroup((p)) : NULL) #define scx_bpf_task_cgroup(p) \
(bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \
scx_bpf_task_cgroup___new((p)) : NULL)
/* /*
* v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are
* renamed to unload the verb. * renamed to unload the verb.
* *
* Build error is triggered if old names are used. New binaries work with both
* new and old names. The compat macros will be removed on v6.15 release.
*
* scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
* 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
* Preserve __COMPAT macros until v6.15.
*/ */
void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak;
(bpf_ksym_exists(scx_bpf_dsq_insert) ? \ void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \
scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \
scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags)))
#define scx_bpf_dsq_move_to_local(dsq_id) \ #define scx_bpf_dsq_move_to_local(dsq_id) \
(bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \
scx_bpf_dsq_move_to_local((dsq_id)) : \ scx_bpf_dsq_move_to_local___new((dsq_id)) : \
scx_bpf_consume___compat((dsq_id))) scx_bpf_consume___old((dsq_id)))
#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ #define scx_bpf_dsq_move_set_slice(it__iter, slice) \
(bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \
scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \
scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \
(void)0)) (void)0))
#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ #define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
(bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \
scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \
scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \
(void) 0)) (void)0))
#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ #define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_move) ? \ (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \
scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \
scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false)) false))
#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ #define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \
scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \
(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \
scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false)) false))
/*
* v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits")
*
* Compat macro will be dropped on v6.19 release.
*/
int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
(bpf_ksym_exists(bpf_cpumask_populate) ? \ (bpf_ksym_exists(bpf_cpumask_populate) ? \
(bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ /*
_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") * v6.19: Introduce lockless peek API for user DSQs.
*
* Preserve the following macro until v6.21.
*/
static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
{
struct task_struct *p = NULL;
struct bpf_iter_scx_dsq it;
#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ if (bpf_ksym_exists(scx_bpf_dsq_peek))
_Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") return scx_bpf_dsq_peek(dsq_id);
if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
#define scx_bpf_consume(dsq_id) ({ \ p = bpf_iter_scx_dsq_next(&it);
_Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ bpf_iter_scx_dsq_destroy(&it);
false; \ return p;
}) }
#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()")
#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()")
#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \
false; \
})
#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \
false; \
})
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()")
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()")
#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \
false; \
})
#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \
false; \
})
/** /**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
@ -247,6 +220,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
return rq ? rq->curr : NULL; return rq ? rq->curr : NULL;
} }
/*
* v6.19: To work around BPF maximum parameter limit, the following kfuncs are
* replaced with variants that pack scalar arguments in a struct. Wrappers are
* provided to maintain source compatibility.
*
* v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the
* block on dispatch renaming above for more details.
*
* The kernel will carry the compat variants until v6.23 to maintain binary
* compatibility. After v6.23 release, remove the compat handling and move the
* wrappers to common.bpf.h.
*/
s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
/**
* scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
* @p: task_struct to select a CPU for
* @prev_cpu: CPU @p was on previously
* @wake_flags: %SCX_WAKE_* flags
* @cpus_allowed: cpumask of allowed CPUs
* @flags: %SCX_PICK_IDLE* flags
*
* Inline wrapper that packs scalar arguments into a struct and calls
* __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
*/
static inline s32
scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
		       const struct cpumask *cpus_allowed, u64 flags)
{
	/*
	 * CO-RE type-existence probe: the args struct is only present in the
	 * BTF of kernels (v6.19+) that provide the struct-packed kfunc
	 * variant, so this selects the right ABI at load time.
	 */
	if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
		struct scx_bpf_select_cpu_and_args args = {
			.prev_cpu = prev_cpu,
			.wake_flags = wake_flags,
			.flags = flags,
		};

		return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
	} else {
		/* Older kernels: call the flat-argument compat kfunc. */
		return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
						       cpus_allowed, flags);
	}
}
/**
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
* @p: task_struct to insert
* @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
* @enq_flags: SCX_ENQ_*
*
* Inline wrapper that packs scalar arguments into a struct and calls
* __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
*/
static inline bool
scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
			 u64 enq_flags)
{
	/*
	 * v6.19+: the struct-packed kfunc exists (detected via CO-RE BTF
	 * probe) and reports whether the insertion took place.
	 */
	if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
		struct scx_bpf_dsq_insert_vtime_args args = {
			.dsq_id = dsq_id,
			.slice = slice,
			.vtime = vtime,
			.enq_flags = enq_flags,
		};

		return __scx_bpf_dsq_insert_vtime(p, &args);
	}

	/*
	 * Older kernels only have void variants which can't report failure;
	 * pretend success. Prefer the v6.13+ name, fall back to the pre-rename
	 * scx_bpf_dispatch_vtime().
	 */
	if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat))
		scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
						  enq_flags);
	else
		scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime,
						enq_flags);
	return true;
}
/*
* v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move
* scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22.
* The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on
* kernel side. The entire suffix can be dropped later.
*
* v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on
* dispatch renaming above for more details.
*/
bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
static inline bool
scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags)
{
	/* v6.19+: the v2 kfunc reports whether the insertion took place. */
	if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat))
		return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice,
							enq_flags);

	/*
	 * Older kernels only provide void variants; assume success. Prefer the
	 * v6.13+ name, fall back to the pre-rename scx_bpf_dispatch().
	 */
	if (bpf_ksym_exists(scx_bpf_dsq_insert___v1))
		scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags);
	else
		scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags);
	return true;
}
/*
 * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added for
* sub-sched authority checks. Drop the wrappers and move the decls to
* common.bpf.h after v6.22.
*/
bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak;
bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak;
static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
{
	/* Pre-v6.19 kernels lack the kfunc; write the field directly. */
	if (!bpf_ksym_exists(scx_bpf_task_set_slice___new)) {
		p->scx.slice = slice;
		return;
	}
	scx_bpf_task_set_slice___new(p, slice);
}
static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
{
	/* Pre-v6.19 kernels lack the kfunc; write the field directly. */
	if (!bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new)) {
		p->scx.dsq_vtime = vtime;
		return;
	}
	scx_bpf_task_set_dsq_vtime___new(p, vtime);
}
/*
* v6.19: The new void variant can be called from anywhere while the older v1
 * variant can only be called from ops.cpu_release(). The double ___ suffixes on
 * the v2 variant need to be removed once libbpf is updated to ignore ___ suffix
 * on kernel side. Drop the wrapper and move the decl to common.bpf.h after
* v6.22.
*/
u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
/*
 * True iff the running kernel provides the v2 kfunc, which may be called from
 * any context rather than only from ops.cpu_release().
 */
static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
{
	return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
}
static inline void scx_bpf_reenqueue_local(void)
{
	/*
	 * Prefer the v2 kfunc which is callable from anywhere; otherwise fall
	 * back to v1 (ops.cpu_release() only). v1's count return is discarded.
	 */
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) {
		scx_bpf_reenqueue_local___v1();
		return;
	}
	scx_bpf_reenqueue_local___v2___compat();
}
/* /*
* Define sched_ext_ops. This may be expanded to define multiple variants for * Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

View File

@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void)
* *
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version. * the current minimum required kernel version.
*
* COMPAT:
* - v6.17: ops.cgroup_set_bandwidth()
* - v6.19: ops.cgroup_set_idle()
*/ */
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \ struct __scx_name *__skel; \
@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void)
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
SCX_ENUM_INIT(__skel); \ SCX_ENUM_INIT(__skel); \
if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \
!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \
__skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \
} \
if (__skel->struct_ops.__ops_name->cgroup_set_idle && \
!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
__skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \
} \
__skel; \ __skel; \
}) })

View File

@ -0,0 +1,88 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* A CPU0 scheduler.
*
* This scheduler queues all tasks to a shared DSQ and only dispatches them on
* CPU0 in FIFO order. This is useful for testing bypass behavior when many
* tasks are concentrated on a single CPU. If the load balancer doesn't work,
* bypass mode can trigger task hangs or RCU stalls as the queue is long and
* there's only one CPU working on it.
*
* - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
* - Termination notification for userspace.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#include <scx/common.bpf.h>
char _license[] SEC("license") = "GPL";
const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
UEI_DEFINE(uei);
/*
* We create a custom DSQ with ID 0 that we dispatch to and consume from on
* CPU0.
*/
#define DSQ_CPU0 0
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u64));
__uint(max_entries, 2); /* [local, cpu0] */
} stats SEC(".maps");
/* Bump this CPU's counter at @idx in the per-CPU stats map; 0=local, 1=cpu0. */
static void stat_inc(u32 idx)
{
	u64 *cnt = bpf_map_lookup_elem(&stats, &idx);

	/* Lookup fails for out-of-range indices; silently ignore. */
	if (cnt)
		*cnt += 1;
}
/*
 * Always steer wakeups to CPU0. enqueue() handles tasks that end up unable to
 * run there (see cpu0_enqueue()).
 */
s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	return 0;	/* CPU0 */
}
void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run
	 * on CPU0. Queue on whichever CPU it's currently on.
	 */
	if (scx_bpf_task_cpu(p) != 0) {
		stat_inc(0); /* count local queueing */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return;
	}

	stat_inc(1); /* count cpu0 queueing */
	scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
}
/* Only CPU0 drains the shared DSQ; other CPUs run just their local DSQs. */
void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
{
	if (cpu == 0)
		scx_bpf_dsq_move_to_local(DSQ_CPU0);
}
/* Create the shared CPU0 DSQ (-1: no specific NUMA node). */
s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
{
	return scx_bpf_create_dsq(DSQ_CPU0, -1);
}
/* Record exit info so userspace can report it and decide whether to restart. */
void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
/* struct_ops definition wiring the cpu0 scheduler callbacks together. */
SCX_OPS_DEFINE(cpu0_ops,
	       .select_cpu = (void *)cpu0_select_cpu,
	       .enqueue = (void *)cpu0_enqueue,
	       .dispatch = (void *)cpu0_dispatch,
	       .init = (void *)cpu0_init,
	       .exit = (void *)cpu0_exit,
	       .name = "cpu0");

106
tools/sched_ext/scx_cpu0.c Normal file
View File

@ -0,0 +1,106 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <assert.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_cpu0.bpf.skel.h"
const char help_fmt[] =
"A cpu0 sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-v]\n"
"\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
/* libbpf log callback: forward everything except debug chatter unless -v. */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);
	return 0;
}
/* SIGINT/SIGTERM handler: ask the main loop to exit cleanly. */
static void sigint_handler(int sig)
{
	exit_req = 1;
}
/*
 * Sum the per-CPU [local, cpu0] counters from the BPF stats map into
 * stats[0] and stats[1].
 */
static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
{
	int nr_cpus = libbpf_num_possible_cpus();

	assert(nr_cpus > 0);
	__u64 per_cpu[2][nr_cpus];	/* VLA sized by possible CPUs */
	__u32 idx;

	memset(stats, 0, sizeof(stats[0]) * 2);

	for (idx = 0; idx < 2; idx++) {
		int cpu;

		/* On lookup failure the total for this index stays 0. */
		if (bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
					&idx, per_cpu[idx]) < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += per_cpu[idx][cpu];
	}
}
int main(int argc, char **argv)
{
	struct scx_cpu0 *skel;
	struct bpf_link *link;
	__u32 opt;	/* getopt() returns int; -1 compares equal after conversion */
	__u64 ecode;

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);
restart:
	/* Open the skeleton; re-entered when the scheduler requests restart. */
	skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
	skel->rodata->nr_cpus = libbpf_num_possible_cpus();

	/*
	 * Note: -v is parsed after SCX_OPS_OPEN(), so libbpf debug output
	 * emitted during open is still suppressed. On restart the options were
	 * already consumed, so the loop exits immediately.
	 */
	while ((opt = getopt(argc, argv, "vh")) != -1) {
		switch (opt) {
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			return opt != 'h';
		}
	}

	SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
	link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);

	/* Print queueing stats once a second until signaled or BPF exits. */
	while (!exit_req && !UEI_EXITED(skel, uei)) {
		__u64 stats[2];

		read_stats(skel, stats);
		printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
		fflush(stdout);
		sleep(1);
	}

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);
	scx_cpu0__destroy(skel);

	/* The BPF side can request a full reload via its exit code. */
	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}

View File

@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return; return;
} }
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp); cgc = find_cgrp_ctx(cgrp);
if (!cgc) if (!cgc)
goto out_release; goto out_release;
@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{ {
struct cgroup *cgrp; struct cgroup *cgrp;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, true); update_active_weight_sums(cgrp, true);
bpf_cgroup_release(cgrp); bpf_cgroup_release(cgrp);
} }
@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
if (fifo_sched) if (fifo_sched)
return; return;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp); cgc = find_cgrp_ctx(cgrp);
if (cgc) { if (cgc) {
/* /*
@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
if (!taskc->bypassed_at) if (!taskc->bypassed_at)
return; return;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp); cgc = find_cgrp_ctx(cgrp);
if (cgc) { if (cgc) {
__sync_fetch_and_add(&cgc->cvtime_delta, __sync_fetch_and_add(&cgc->cvtime_delta,
@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{ {
struct cgroup *cgrp; struct cgroup *cgrp;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, false); update_active_weight_sums(cgrp, false);
bpf_cgroup_release(cgrp); bpf_cgroup_release(cgrp);
} }

View File

@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
void *ring; void *ring;
s32 cpu; s32 cpu;
if (enq_flags & SCX_ENQ_REENQ)
__sync_fetch_and_add(&nr_reenqueued, 1);
if (p->flags & PF_KTHREAD) { if (p->flags & PF_KTHREAD) {
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
return; return;
@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
if (tctx->highpri) { if (tctx->highpri) {
/* exercise the set_*() and vtime interface too */ /* exercise the set_*() and vtime interface too */
__COMPAT_scx_bpf_dsq_move_set_slice( scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
BPF_FOR_EACH_ITER, slice_ns * 2); scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
__COMPAT_scx_bpf_dsq_move_set_vtime( scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
BPF_FOR_EACH_ITER, highpri_seq++);
__COMPAT_scx_bpf_dsq_move_vtime(
BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
} }
} }
@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
else else
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
SCX_DSQ_LOCAL_ON | cpu, SCX_ENQ_PREEMPT)) {
SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) { if (cpu == this_cpu) {
dispatched = true; dispatched = true;
__sync_fetch_and_add(&nr_expedited_local, 1); __sync_fetch_and_add(&nr_expedited_local, 1);
@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
return task_qdist(a) > task_qdist(b); return task_qdist(a) > task_qdist(b);
} }
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) SEC("tp_btf/sched_switch")
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
struct task_struct *next, unsigned long prev_state)
{ {
u32 cnt; if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
return 0;
/* /*
* Called when @cpu is taken by a higher priority scheduling class. This * If @cpu is taken by a higher priority scheduling class, it is no
* makes @cpu no longer available for executing sched_ext tasks. As we * longer available for executing sched_ext tasks. As we don't want the
* don't want the tasks in @cpu's local dsq to sit there until @cpu * tasks in @cpu's local dsq to sit there until @cpu becomes available
* becomes available again, re-enqueue them into the global dsq. See * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
* %SCX_ENQ_REENQ handling in qmap_enqueue(). * handling in qmap_enqueue().
*/ */
cnt = scx_bpf_reenqueue_local(); switch (next->policy) {
if (cnt) case 1: /* SCHED_FIFO */
__sync_fetch_and_add(&nr_reenqueued, cnt); case 2: /* SCHED_RR */
case 6: /* SCHED_DEADLINE */
scx_bpf_reenqueue_local();
}
return 0;
}
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
/* see qmap_sched_switch() to learn how to do this on newer kernels */
if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
scx_bpf_reenqueue_local();
} }
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,

View File

@ -174,6 +174,7 @@ auto-test-targets := \
minimal \ minimal \
numa \ numa \
allowed_cpus \ allowed_cpus \
peek_dsq \
prog_run \ prog_run \
reload_loop \ reload_loop \
select_cpu_dfl \ select_cpu_dfl \

View File

@ -0,0 +1,251 @@
// SPDX-License-Identifier: GPL-2.0
/*
* A BPF program for testing DSQ operations and peek in particular.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
*/
#include <scx/common.bpf.h>
#include <scx/compat.bpf.h>
char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei); /* Error handling */
#define MAX_SAMPLES 100
#define MAX_CPUS 512
#define DSQ_POOL_SIZE 8
int max_samples = MAX_SAMPLES;
int max_cpus = MAX_CPUS;
int dsq_pool_size = DSQ_POOL_SIZE;
/* Global variables to store test results */
int dsq_peek_result1 = -1;
long dsq_inserted_pid = -1;
int insert_test_cpu = -1; /* Set to the cpu that performs the test */
long dsq_peek_result2 = -1;
long dsq_peek_result2_pid = -1;
long dsq_peek_result2_expected = -1;
int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */
int real_dsq_id = 1235; /* DSQ for normal operation */
int enqueue_count = -1;
int dispatch_count = -1;
bool debug_ksym_exists;
/* DSQ pool for stress testing */
int dsq_pool_base_id = 2000;
int phase1_complete = -1;
long total_peek_attempts = -1;
long successful_peeks = -1;
/* BPF map for sharing peek results with userspace */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, MAX_SAMPLES);
__type(key, u32);
__type(value, long);
} peek_results SEC(".maps");
/* Pick a pseudo-random DSQ from the pool, spreading enqueues by time. */
static int get_random_dsq_id(void)
{
	return dsq_pool_base_id + (int)(bpf_ktime_get_ns() % DSQ_POOL_SIZE);
}
/*
 * Record @pid into the peek_results array map using linear probing: try up to
 * ten slots starting at pid %% MAX_SAMPLES and claim the first empty (-1)
 * slot, or reuse a slot that already holds @pid. Probing is bounded so the
 * BPF verifier can prove termination.
 */
static void record_peek_result(long pid)
{
	u32 slot_key;
	long *slot_pid_ptr;
	int ix;

	/* 0 and negative pids are sentinels, not real observations. */
	if (pid <= 0)
		return;

	/* Find an empty slot or one with the same PID */
	bpf_for(ix, 0, 10) {
		slot_key = (pid + ix) % MAX_SAMPLES;
		slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
		if (!slot_pid_ptr)
			continue;
		if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
			*slot_pid_ptr = pid;
			break;
		}
	}
}
/* Scan all DSQs in the pool and try to move a task to local */
static int scan_dsq_pool(void)
{
	struct task_struct *task;
	int moved = 0;
	int i;

	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		total_peek_attempts++;
		task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
		if (task) {
			successful_peeks++;
			record_peek_result(task->pid);
			/*
			 * Try to move this task to local.
			 *
			 * NOTE(review): on current kernels
			 * scx_bpf_dsq_move_to_local() returns bool (true when
			 * a task was moved), so the "== 0" comparison below
			 * looks inverted — confirm the intended semantics.
			 */
			if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) {
				moved = 1;
				break;
			}
		}
	}
	return moved;
}
/* Struct_ops scheduler for testing DSQ peek operations */
/* Enqueue path: runs the one-shot phase-1 insert/peek test, then routes tasks. */
void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_struct *peek_result;
	int last_insert_test_cpu, cpu;

	enqueue_count++;
	cpu = bpf_get_smp_processor_id();
	/*
	 * One-shot claim: only the CPU that swaps insert_test_cpu from -1 to
	 * itself performs the phase-1 test below.
	 */
	last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);

	/* Phase 1: Simple insert-then-peek test (only on first task) */
	if (last_insert_test_cpu == -1) {
		bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);

		/* Test 1: Peek empty DSQ - should return NULL */
		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
		/*
		 * NOTE(review): dsq_peek_result1 is declared int, so the
		 * pointer is truncated on 64-bit; only the ==0 check is used,
		 * but a long would be safer.
		 */
		dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */

		/* Test 2: Insert task into test DSQ for testing in dispatch callback */
		dsq_inserted_pid = p->pid;
		scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
		dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */
	} else if (!phase1_complete) {
		/* Still in phase 1, use real DSQ */
		scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
	} else {
		/* Phase 2: Random DSQ insertion for stress testing */
		int random_dsq_id = get_random_dsq_id();

		scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
	}
}
/*
 * Dispatch path: the CPU that ran the phase-1 insert finishes the peek test
 * here; everyone else consumes from the real DSQ (phase 1) or the pool
 * (phase 2).
 */
void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
{
	dispatch_count++;

	/* Phase 1: Complete the simple peek test if we inserted a task but
	 * haven't tested peek yet
	 */
	if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
		struct task_struct *peek_result;

		bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);

		/* Test 3: Peek DSQ after insert - should return the task we inserted */
		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);

		/* Store the PID of the peeked task for comparison */
		dsq_peek_result2 = (long)peek_result;
		dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;

		/* Now consume the task since we've peeked at it */
		scx_bpf_dsq_move_to_local(test_dsq_id);

		/* Mark phase 1 as complete */
		phase1_complete = 1;
		bpf_printk("Phase 1 complete, starting phase 2 stress testing");
	} else if (!phase1_complete) {
		/* Still in phase 1, use real DSQ */
		scx_bpf_dsq_move_to_local(real_dsq_id);
	} else {
		/* Phase 2: Scan all DSQs in the pool and try to move a task */
		if (!scan_dsq_pool()) {
			/* No tasks found in DSQ pool, fall back to real DSQ */
			scx_bpf_dsq_move_to_local(real_dsq_id);
		}
	}
}
/*
 * Create all DSQs and reset test state. Returns 0 on success or the first
 * scx_bpf_create_dsq() error.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
{
	s32 err;
	int i;

	/* Record which peek implementation (native kfunc vs fallback) is in use */
	debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;

	/* Initialize state first so a reload starts from a clean slate */
	insert_test_cpu = -1;
	enqueue_count = 0;
	dispatch_count = 0;
	phase1_complete = 0;
	total_peek_attempts = 0;
	successful_peeks = 0;

	/* Create the test and real DSQs */
	err = scx_bpf_create_dsq(test_dsq_id, -1);
	if (err) {
		scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
		return err;
	}
	err = scx_bpf_create_dsq(real_dsq_id, -1);
	if (err) {
		/* Fix: report real_dsq_id here (was copy-pasted test_dsq_id) */
		scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
		return err;
	}

	/* Create the DSQ pool for stress testing */
	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		err = scx_bpf_create_dsq(dsq_id, -1);
		if (err) {
			scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
			return err;
		}
	}

	/* Initialize the peek results map: -1 marks an empty slot */
	bpf_for(i, 0, MAX_SAMPLES) {
		u32 key = i;
		long pid = -1;

		bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
	}
	return 0;
}
/* Tear down every DSQ created in init and record exit info for userspace. */
void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
{
	int i;

	/* Destroy the primary DSQs */
	scx_bpf_destroy_dsq(test_dsq_id);
	scx_bpf_destroy_dsq(real_dsq_id);

	/* Destroy the DSQ pool */
	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		scx_bpf_destroy_dsq(dsq_id);
	}
	UEI_RECORD(uei, ei);
}
/* struct_ops registration for the peek test scheduler, attached via link. */
SEC(".struct_ops.link")
struct sched_ext_ops peek_dsq_ops = {
	.enqueue = (void *)peek_dsq_enqueue,
	.dispatch = (void *)peek_dsq_dispatch,
	.init = (void *)peek_dsq_init,
	.exit = (void *)peek_dsq_exit,
	.name = "peek_dsq",
};

View File

@ -0,0 +1,224 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Test for DSQ operations including create, destroy, and peek operations.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
*/
#include <bpf/bpf.h>
#include <scx/common.h>
#include <sys/wait.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include "peek_dsq.bpf.skel.h"
#include "scx_test.h"
#define NUM_WORKERS 4
static bool workload_running = true;
static pthread_t workload_threads[NUM_WORKERS];
/**
* Background workload thread that sleeps and wakes rapidly to exercise
* the scheduler's enqueue operations and ensure DSQ operations get tested.
*/
static void *workload_thread_fn(void *arg)
{
	/*
	 * NOTE(review): workload_running is a plain bool written by the main
	 * thread and read here without atomics — consider atomic_bool; for a
	 * test that only needs eventual termination this is tolerable.
	 */
	while (workload_running) {
		/* Sleep for a very short time to trigger scheduler activity */
		usleep(1000); /* 1ms sleep */

		/* Yield to ensure we go through the scheduler */
		sched_yield();
	}
	return NULL;
}
/* Open and load the BPF skeleton; attachment happens later in run(). */
static enum scx_test_status setup(void **ctx)
{
	struct peek_dsq *skel;

	skel = peek_dsq__open();
	SCX_FAIL_IF(!skel, "Failed to open");
	SCX_ENUM_INIT(skel);
	SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
	*ctx = skel;	/* ownership passes to run()/cleanup() */

	return SCX_TEST_PASS;
}
/*
 * Dump the contents of the peek-results map and return how many valid pids
 * (positive values) were observed. A value of 0 means a recorded NULL peek,
 * -1 an untouched slot.
 */
static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
{
	long nr_seen = 0;

	printf("Observed %s DSQ peek pids:\n", dsq_name);
	for (int i = 0; i < max_samples; i++) {
		long pid;
		int err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);

		if (err) {
			printf(" Sample %d: error reading pid (err=%d)\n", i, err);
			continue;
		}
		if (pid == 0) {
			printf(" Sample %d: NULL peek\n", i);
		} else if (pid > 0) {
			printf(" Sample %d: pid %ld\n", i, pid);
			nr_seen++;
		}
	}
	printf("Observed ~%ld pids in the %s DSQ(s)\n", nr_seen, dsq_name);
	return nr_seen;
}
/*
 * Drive the test: attach the scheduler, generate wakeup-heavy load via
 * worker threads, then verify the insert/peek results the BPF side recorded.
 */
static enum scx_test_status run(void *ctx)
{
	struct peek_dsq *skel = ctx;
	bool failed = false;
	int seconds = 3;
	int err;

	/* Enable the scheduler to test DSQ operations */
	printf("Enabling scheduler to test DSQ insert operations...\n");
	struct bpf_link *link =
		bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
	if (!link) {
		SCX_ERR("Failed to attach struct_ops");
		return SCX_TEST_FAIL;
	}

	printf("Starting %d background workload threads...\n", NUM_WORKERS);
	workload_running = true;
	for (int i = 0; i < NUM_WORKERS; i++) {
		err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
		if (err) {
			SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
			/* Stop already created threads */
			workload_running = false;
			for (int j = 0; j < i; j++)
				pthread_join(workload_threads[j], NULL);
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}

	printf("Waiting for enqueue events.\n");
	sleep(seconds);
	/* Poll until the BPF side has seen at least one enqueue, capped at ~30s. */
	while (skel->data->enqueue_count <= 0) {
		printf(".");
		fflush(stdout);
		sleep(1);
		seconds++;
		if (seconds >= 30) {
			printf("\n\u2717 Timeout waiting for enqueue events\n");
			/* Stop workload threads and cleanup */
			workload_running = false;
			for (int i = 0; i < NUM_WORKERS; i++)
				pthread_join(workload_threads[i], NULL);
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}

	/* Stop and reap the load generators before checking results. */
	workload_running = false;
	for (int i = 0; i < NUM_WORKERS; i++) {
		err = pthread_join(workload_threads[i], NULL);
		if (err) {
			SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}
	printf("Background workload threads stopped.\n");

	/* The scheduler must not have errored out while it was attached. */
	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));

	/* Detach the scheduler */
	bpf_link__destroy(link);

	printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
	       skel->data->enqueue_count, skel->data->dispatch_count);
	printf("Debug: ksym_exists=%d\n",
	       skel->bss->debug_ksym_exists);

	/* Check DSQ insert result */
	printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
	if (skel->data->insert_test_cpu != -1)
		printf("\u2713 DSQ insert succeeded !\n");
	else {
		printf("\u2717 DSQ insert failed or not attempted\n");
		failed = true;
	}

	/* Check DSQ peek results: peek of an empty DSQ must have been NULL */
	printf(" DSQ peek result 1 (before insert): %d\n",
	       skel->data->dsq_peek_result1);
	if (skel->data->dsq_peek_result1 == 0)
		printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
	else {
		printf("\u2717 DSQ peek verification failed\n");
		failed = true;
	}

	/* Peek after insert must have returned the task that was inserted */
	printf(" DSQ peek result 2 (after insert): %ld\n",
	       skel->data->dsq_peek_result2);
	printf(" DSQ peek result 2, expected: %ld\n",
	       skel->data->dsq_peek_result2_expected);
	if (skel->data->dsq_peek_result2 ==
	    skel->data->dsq_peek_result2_expected)
		printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
	else {
		printf("\u2717 DSQ peek verification failed\n");
		failed = true;
	}
	printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
	printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);

	/* Phase-2 stress results: pool peeks only count in native-kfunc mode */
	int pid_count;

	pid_count = print_observed_pids(skel->maps.peek_results,
					skel->data->max_samples, "DSQ pool");
	printf("Total non-null peek observations: %ld out of %ld\n",
	       skel->data->successful_peeks, skel->data->total_peek_attempts);
	if (skel->bss->debug_ksym_exists && pid_count == 0) {
		printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
		failed = true;
	}
	if (skel->bss->debug_ksym_exists && pid_count > 0)
		printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");

	if (failed)
		return SCX_TEST_FAIL;
	else
		return SCX_TEST_PASS;
}
/* Release test resources; also reaps workload threads if run() bailed early. */
static void cleanup(void *ctx)
{
	struct peek_dsq *skel = ctx;

	if (workload_running) {
		workload_running = false;
		for (int i = 0; i < NUM_WORKERS; i++)
			pthread_join(workload_threads[i], NULL);
	}
	peek_dsq__destroy(skel);
}
/* Test registration consumed by the scx_test runner. */
struct scx_test peek_dsq = {
	.name = "peek_dsq",
	.description =
		"Test DSQ create/destroy operations and future peek functionality",
	.setup = setup,
	.run = run,
	.cleanup = cleanup,
};
REGISTER_SCX_TEST(&peek_dsq)