sched_ext: Changes for v6.19

- Improve recovery from misbehaving BPF schedulers. When a scheduler puts many
   tasks with varying affinity restrictions on a shared DSQ, CPUs scanning
   through tasks they cannot run can overwhelm the system, causing lockups.
   Bypass mode now uses per-CPU DSQs with a load balancer to avoid this, and
   hooks into the hardlockup detector to attempt recovery. Add scx_cpu0 example
   scheduler to demonstrate this scenario.
 
 - Add lockless peek operation for DSQs to reduce lock contention for schedulers
   that need to query queue state during load balancing.
 
 - Allow scx_bpf_reenqueue_local() to be called from anywhere in preparation for
   deprecating cpu_acquire/release() callbacks in favor of generic BPF hooks.
 
 - Prepare for hierarchical scheduler support: add scx_bpf_task_set_slice() and
   scx_bpf_task_set_dsq_vtime() kfuncs, make scx_bpf_dsq_insert*() return bool,
   and wrap kfunc args in structs for future aux__prog parameter.
 
 - Implement cgroup_set_idle() callback to notify BPF schedulers when a cgroup's
   idle state changes.
 
- Fix migration tasks being incorrectly downgraded from stop_sched_class to
   rt_sched_class across sched_ext enable/disable. Applied late because the fix
   is low-risk and the bug, though subtle, needs stable backporting.
 
 - Various fixes and cleanups including cgroup exit ordering, SCX_KICK_WAIT
   reliability, and backward compatibility improvements.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCaS4h1A4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGe/MAP9EZ0pLiTpmMtt6mI/11Fmi+aWfL84j1zt13cz9
 W4vb4gEA9eVEH6n9xyC4nhcOk9AQwSDuCWMOzLsnhW8TbEHVTww=
 =8W/B
 -----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext updates from Tejun Heo:

 - Improve recovery from misbehaving BPF schedulers.

   When a scheduler puts many tasks with varying affinity restrictions
   on a shared DSQ, CPUs scanning through tasks they cannot run can
   overwhelm the system, causing lockups.

   Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
   and hooks into the hardlockup detector to attempt recovery.

   Add scx_cpu0 example scheduler to demonstrate this scenario.

 - Add lockless peek operation for DSQs to reduce lock contention for
   schedulers that need to query queue state during load balancing.

 - Allow scx_bpf_reenqueue_local() to be called from anywhere in
   preparation for deprecating cpu_acquire/release() callbacks in favor
   of generic BPF hooks.

 - Prepare for hierarchical scheduler support: add
   scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
   make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
   structs for future aux__prog parameter.

 - Implement cgroup_set_idle() callback to notify BPF schedulers when a
   cgroup's idle state changes.

 - Fix migration tasks being incorrectly downgraded from
   stop_sched_class to rt_sched_class across sched_ext enable/disable.
   Applied late because the fix is low-risk and the bug, though subtle,
   needs stable backporting.

 - Various fixes and cleanups including cgroup exit ordering,
   SCX_KICK_WAIT reliability, and backward compatibility improvements.

* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
  sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
  sched_ext: tools: Removing duplicate targets during non-cross compilation
  sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
  sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
  sched_ext: Update comments replacing breather with aborting mechanism
  sched_ext: Implement load balancer for bypass mode
  sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
  sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
  sched_ext: Add scx_cpu0 example scheduler
  sched_ext: Hook up hardlockup detector
  sched_ext: Make handle_lockup() propagate scx_verror() result
  sched_ext: Refactor lockup handlers into handle_lockup()
  sched_ext: Make scx_exit() and scx_vexit() return bool
  sched_ext: Exit dispatch and move operations immediately when aborting
  sched_ext: Simplify breather mechanism with scx_aborting flag
  sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
  sched_ext: Refactor do_enqueue_task() local and global DSQ paths
  sched_ext: Use shorter slice in bypass mode
  sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
  sched_ext: Minor cleanups to scx_task_iter
  ...
This commit is contained in:
Linus Torvalds 2025-12-03 13:25:39 -08:00
commit 02baaa67d9
20 changed files with 1905 additions and 423 deletions

View File

@ -17,7 +17,18 @@
enum scx_public_consts { enum scx_public_consts {
SCX_OPS_NAME_LEN = 128, SCX_OPS_NAME_LEN = 128,
/*
* %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
* to set the slice for a task that is selected for execution.
* %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
* refill has been triggered.
*
* %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
* mode. As making forward progress for all tasks is the main goal of
* the bypass mode, a shorter slice is used.
*/
SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */
SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
}; };
@ -46,6 +57,7 @@ enum scx_dsq_id_flags {
SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3,
SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
}; };
@ -58,6 +70,7 @@ enum scx_dsq_id_flags {
*/ */
struct scx_dispatch_q { struct scx_dispatch_q {
raw_spinlock_t lock; raw_spinlock_t lock;
struct task_struct __rcu *first_task; /* lockless peek at head */
struct list_head list; /* tasks in dispatch order */ struct list_head list; /* tasks in dispatch order */
struct rb_root priq; /* used to order by p->scx.dsq_vtime */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */
u32 nr; u32 nr;
@ -136,6 +149,13 @@ struct scx_dsq_list_node {
u32 priv; /* can be used by iter cursor */ u32 priv; /* can be used by iter cursor */
}; };
#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \
(struct scx_dsq_list_node) { \
.node = LIST_HEAD_INIT((__node).node), \
.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \
.priv = (__priv), \
}
/* /*
* The following is embedded in task_struct and contains all fields necessary * The following is embedded in task_struct and contains all fields necessary
* for a task to be scheduled by SCX. * for a task to be scheduled by SCX.
@ -207,16 +227,18 @@ struct sched_ext_entity {
struct list_head tasks_node; struct list_head tasks_node;
}; };
void sched_ext_free(struct task_struct *p); void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s); void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void); bool scx_rcu_cpu_stall(void);
#else /* !CONFIG_SCHED_CLASS_EXT */ #else /* !CONFIG_SCHED_CLASS_EXT */
static inline void sched_ext_free(struct task_struct *p) {} static inline void sched_ext_dead(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {} static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_hardlockup(int cpu) { return false; }
static inline bool scx_rcu_cpu_stall(void) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; }
#endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* CONFIG_SCHED_CLASS_EXT */
@ -228,6 +250,7 @@ struct scx_task_group {
u64 bw_period_us; u64 bw_period_us;
u64 bw_quota_us; u64 bw_quota_us;
u64 bw_burst_us; u64 bw_burst_us;
bool idle;
#endif #endif
}; };

View File

@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event,
) )
); );
TRACE_EVENT(sched_ext_bypass_lb,
TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced,
__u32 before_min, __u32 before_max,
__u32 after_min, __u32 after_max),
TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced,
before_min, before_max, after_min, after_max),
TP_STRUCT__entry(
__field( __u32, node )
__field( __u32, nr_cpus )
__field( __u32, nr_tasks )
__field( __u32, nr_balanced )
__field( __u32, before_min )
__field( __u32, before_max )
__field( __u32, after_min )
__field( __u32, after_max )
),
TP_fast_assign(
__entry->node = node;
__entry->nr_cpus = nr_cpus;
__entry->nr_tasks = nr_tasks;
__entry->nr_balanced = nr_balanced;
__entry->before_min = before_min;
__entry->before_max = before_max;
__entry->after_min = after_min;
__entry->after_max = after_max;
),
TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u",
__entry->node, __entry->nr_cpus,
__entry->nr_tasks, __entry->nr_balanced,
__entry->before_min, __entry->after_min,
__entry->before_max, __entry->after_max
)
);
#endif /* _TRACE_SCHED_EXT_H */ #endif /* _TRACE_SCHED_EXT_H */
/* This part must be outside protection */ /* This part must be outside protection */

View File

@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current); WARN_ON(tsk == current);
unwind_task_free(tsk); unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk); io_uring_free(tsk);
cgroup_task_free(tsk); cgroup_task_free(tsk);
task_numa_free(tsk, true); task_numa_free(tsk, true);

View File

@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead) if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev); prev->sched_class->task_dead(prev);
/*
* sched_ext_dead() must come before cgroup_task_dead() to
* prevent cgroups from being removed while its member tasks are
* visible to SCX schedulers.
*/
sched_ext_dead(prev);
cgroup_task_dead(prev); cgroup_task_dead(prev);
/* Task is done with its stack. */ /* Task is done with its stack. */

File diff suppressed because it is too large Load Diff

View File

@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
return prev_cpu; return prev_cpu;
} }
struct scx_bpf_select_cpu_and_args {
/* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
s32 prev_cpu;
u64 wake_flags;
u64 flags;
};
/** /**
* scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p, * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
* prioritizing those in @cpus_allowed
* @p: task_struct to select a CPU for * @p: task_struct to select a CPU for
* @prev_cpu: CPU @p was on previously
* @wake_flags: %SCX_WAKE_* flags
* @cpus_allowed: cpumask of allowed CPUs * @cpus_allowed: cpumask of allowed CPUs
* @flags: %SCX_PICK_IDLE* flags * @args: struct containing the rest of the arguments
* @args->prev_cpu: CPU @p was on previously
* @args->wake_flags: %SCX_WAKE_* flags
* @args->flags: %SCX_PICK_IDLE* flags
*
* Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
* limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
* as an inline wrapper in common.bpf.h.
* *
* Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
* context such as a BPF test_run() call, as long as built-in CPU selection * context such as a BPF test_run() call, as long as built-in CPU selection
* is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
* is set. * is set.
* *
* @p, @prev_cpu and @wake_flags match ops.select_cpu(). * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
* *
* Returns the selected idle CPU, which will be automatically awakened upon * Returns the selected idle CPU, which will be automatically awakened upon
* returning from ops.select_cpu() and can be used for direct dispatch, or * returning from ops.select_cpu() and can be used for direct dispatch, or
* a negative value if no idle CPU is available. * a negative value if no idle CPU is available.
*/ */
__bpf_kfunc s32
__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
struct scx_bpf_select_cpu_and_args *args)
{
struct scx_sched *sch;
guard(rcu)();
sch = rcu_dereference(scx_root);
if (unlikely(!sch))
return -ENODEV;
return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
cpus_allowed, args->flags);
}
/*
* COMPAT: Will be removed in v6.22.
*/
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags) const struct cpumask *cpus_allowed, u64 flags)
{ {
@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_idle) BTF_KFUNCS_END(scx_kfunc_ids_idle)

View File

@ -23,6 +23,11 @@ enum scx_consts {
* scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
*/ */
SCX_TASK_ITER_BATCH = 32, SCX_TASK_ITER_BATCH = 32,
SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
SCX_BYPASS_LB_DONOR_PCT = 125,
SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
SCX_BYPASS_LB_BATCH = 256,
}; };
enum scx_exit_kind { enum scx_exit_kind {
@ -697,12 +702,23 @@ struct sched_ext_ops {
* 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
* interpreted in the same fashion and specifies how much @cgrp can * interpreted in the same fashion and specifies how much @cgrp can
* burst temporarily. The specific control mechanism and thus the * burst temporarily. The specific control mechanism and thus the
* interpretation of @period_us and burstiness is upto to the BPF * interpretation of @period_us and burstiness is up to the BPF
* scheduler. * scheduler.
*/ */
void (*cgroup_set_bandwidth)(struct cgroup *cgrp, void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
u64 period_us, u64 quota_us, u64 burst_us); u64 period_us, u64 quota_us, u64 burst_us);
/**
* @cgroup_set_idle: A cgroup's idle state is being changed
* @cgrp: cgroup whose idle state is being updated
* @idle: whether the cgroup is entering or exiting idle state
*
* Update @cgrp's idle state to @idle. This callback is invoked when
* a cgroup transitions between idle and non-idle states, allowing the
* BPF scheduler to adjust its behavior accordingly.
*/
void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
#endif /* CONFIG_EXT_GROUP_SCHED */ #endif /* CONFIG_EXT_GROUP_SCHED */
/* /*
@ -884,6 +900,10 @@ struct scx_sched {
struct scx_dispatch_q **global_dsqs; struct scx_dispatch_q **global_dsqs;
struct scx_sched_pcpu __percpu *pcpu; struct scx_sched_pcpu __percpu *pcpu;
/*
* Updates to the following warned bitfields can race causing RMW issues
* but it doesn't really matter.
*/
bool warned_zero_slice:1; bool warned_zero_slice:1;
bool warned_deprecated_rq:1; bool warned_deprecated_rq:1;
@ -948,6 +968,7 @@ enum scx_enq_flags {
SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
SCX_ENQ_DSQ_PRIQ = 1LLU << 57, SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
SCX_ENQ_NESTED = 1LLU << 58,
}; };
enum scx_deq_flags { enum scx_deq_flags {
@ -986,8 +1007,10 @@ enum scx_kick_flags {
SCX_KICK_PREEMPT = 1LLU << 1, SCX_KICK_PREEMPT = 1LLU << 1,
/* /*
* Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will * The scx_bpf_kick_cpu() call will return after the current SCX task of
* return after the target CPU finishes picking the next task. * the target CPU switches out. This can be used to implement e.g. core
* scheduling. This has no effect if the current task on the target CPU
* is not on SCX.
*/ */
SCX_KICK_WAIT = 1LLU << 2, SCX_KICK_WAIT = 1LLU << 2,
}; };

View File

@ -803,10 +803,12 @@ struct scx_rq {
cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_preempt;
cpumask_var_t cpus_to_wait; cpumask_var_t cpus_to_wait;
unsigned long pnt_seq; unsigned long kick_sync;
local_t reenq_local_deferred;
struct balance_callback deferred_bal_cb; struct balance_callback deferred_bal_cb;
struct irq_work deferred_irq_work; struct irq_work deferred_irq_work;
struct irq_work kick_cpus_irq_work; struct irq_work kick_cpus_irq_work;
struct scx_dispatch_q bypass_dsq;
}; };
#endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* CONFIG_SCHED_CLASS_EXT */

View File

@ -196,6 +196,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
#ifdef CONFIG_SYSFS #ifdef CONFIG_SYSFS
++hardlockup_count; ++hardlockup_count;
#endif #endif
/*
* A poorly behaving BPF scheduler can trigger hard lockup by
* e.g. putting numerous affinitized tasks in a single queue and
* directing all CPUs at it. The following call can return true
* only once when sched_ext is enabled and will immediately
* abort the BPF scheduler and print out a warning message.
*/
if (scx_hardlockup(cpu))
return;
/* Only print hardlockups once. */ /* Only print hardlockups once. */
if (per_cpu(watchdog_hardlockup_warned, cpu)) if (per_cpu(watchdog_hardlockup_warned, cpu))

View File

@ -133,6 +133,7 @@ $(MAKE_DIRS):
$(call msg,MKDIR,,$@) $(call msg,MKDIR,,$@)
$(Q)mkdir -p $@ $(Q)mkdir -p $@
ifneq ($(CROSS_COMPILE),)
$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \ $(APIDIR)/linux/bpf.h \
| $(OBJ_DIR)/libbpf | $(OBJ_DIR)/libbpf
@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
EXTRA_CFLAGS='-g -O0 -fPIC' \ EXTRA_CFLAGS='-g -O0 -fPIC' \
LDFLAGS="$(LDFLAGS)" \ LDFLAGS="$(LDFLAGS)" \
DESTDIR=$(OUTPUT_DIR) prefix= all install_headers DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
endif
$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
$(APIDIR)/linux/bpf.h \ $(APIDIR)/linux/bpf.h \
@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
$(addprefix $(BINDIR)/,$(c-sched-targets)): \ $(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \ $(BINDIR)/%: \

View File

@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void)
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
u32 scx_bpf_dispatch_nr_slots(void) __ksym; u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch_cancel(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym;
bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
struct rq *scx_bpf_locked_rq(void) __ksym; struct rq *scx_bpf_locked_rq(void) __ksym;
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;

View File

@ -16,119 +16,92 @@
}) })
/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
#define __COMPAT_scx_bpf_task_cgroup(p) \ struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
(bpf_ksym_exists(scx_bpf_task_cgroup) ? \
scx_bpf_task_cgroup((p)) : NULL) #define scx_bpf_task_cgroup(p) \
(bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \
scx_bpf_task_cgroup___new((p)) : NULL)
/* /*
* v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are
* renamed to unload the verb. * renamed to unload the verb.
* *
* Build error is triggered if old names are used. New binaries work with both
* new and old names. The compat macros will be removed on v6.15 release.
*
* scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
* 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
* Preserve __COMPAT macros until v6.15.
*/ */
void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak;
(bpf_ksym_exists(scx_bpf_dsq_insert) ? \ void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \
scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \
scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags)))
#define scx_bpf_dsq_move_to_local(dsq_id) \ #define scx_bpf_dsq_move_to_local(dsq_id) \
(bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \
scx_bpf_dsq_move_to_local((dsq_id)) : \ scx_bpf_dsq_move_to_local___new((dsq_id)) : \
scx_bpf_consume___compat((dsq_id))) scx_bpf_consume___old((dsq_id)))
#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ #define scx_bpf_dsq_move_set_slice(it__iter, slice) \
(bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \
scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \
scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \
(void)0)) (void)0))
#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ #define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
(bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \
scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \
scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \
(void) 0)) (void)0))
#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ #define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_move) ? \ (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \
scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \
scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false)) false))
#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ #define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \
scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \
(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \
scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
false)) false))
/*
* v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits")
*
* Compat macro will be dropped on v6.19 release.
*/
int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ #define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
(bpf_ksym_exists(bpf_cpumask_populate) ? \ (bpf_ksym_exists(bpf_cpumask_populate) ? \
(bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ /*
_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") * v6.19: Introduce lockless peek API for user DSQs.
*
* Preserve the following macro until v6.21.
*/
static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
{
struct task_struct *p = NULL;
struct bpf_iter_scx_dsq it;
#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ if (bpf_ksym_exists(scx_bpf_dsq_peek))
_Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") return scx_bpf_dsq_peek(dsq_id);
if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
#define scx_bpf_consume(dsq_id) ({ \ p = bpf_iter_scx_dsq_next(&it);
_Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ bpf_iter_scx_dsq_destroy(&it);
false; \ return p;
}) }
#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()")
#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()")
#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \
false; \
})
#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \
false; \
})
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()")
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()")
#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \
false; \
})
#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \
false; \
})
/** /**
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
@ -247,6 +220,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
return rq ? rq->curr : NULL; return rq ? rq->curr : NULL;
} }
/*
* v6.19: To work around BPF maximum parameter limit, the following kfuncs are
* replaced with variants that pack scalar arguments in a struct. Wrappers are
* provided to maintain source compatibility.
*
* v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the
* block on dispatch renaming above for more details.
*
* The kernel will carry the compat variants until v6.23 to maintain binary
* compatibility. After v6.23 release, remove the compat handling and move the
* wrappers to common.bpf.h.
*/
s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
/**
* scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
* @p: task_struct to select a CPU for
* @prev_cpu: CPU @p was on previously
* @wake_flags: %SCX_WAKE_* flags
* @cpus_allowed: cpumask of allowed CPUs
* @flags: %SCX_PICK_IDLE* flags
*
* Inline wrapper that packs scalar arguments into a struct and calls
* __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
*/
static inline s32
scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
		       const struct cpumask *cpus_allowed, u64 flags)
{
	/*
	 * CO-RE type-existence probe: the args struct is only present in the
	 * BTF of kernels (v6.19+) that provide the struct-packed kfunc
	 * variant, so this selects the right ABI at load time.
	 */
	if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
		struct scx_bpf_select_cpu_and_args args = {
			.prev_cpu = prev_cpu,
			.wake_flags = wake_flags,
			.flags = flags,
		};

		return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
	} else {
		/* Older kernels: call the flat-argument compat kfunc. */
		return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
						       cpus_allowed, flags);
	}
}
/**
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
* @p: task_struct to insert
* @dsq_id: DSQ to insert into
* @slice: duration @p can run for in nsecs, 0 to keep the current value
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
* @enq_flags: SCX_ENQ_*
*
* Inline wrapper that packs scalar arguments into a struct and calls
* __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
*/
static inline bool
scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
			 u64 enq_flags)
{
	/*
	 * v6.19+: the struct-packed kfunc exists (detected via CO-RE BTF
	 * probe) and reports whether the insertion took place.
	 */
	if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
		struct scx_bpf_dsq_insert_vtime_args args = {
			.dsq_id = dsq_id,
			.slice = slice,
			.vtime = vtime,
			.enq_flags = enq_flags,
		};

		return __scx_bpf_dsq_insert_vtime(p, &args);
	}

	/*
	 * Older kernels only have void variants which can't report failure;
	 * pretend success. Prefer the v6.13+ name, fall back to the pre-rename
	 * scx_bpf_dispatch_vtime().
	 */
	if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat))
		scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
						  enq_flags);
	else
		scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime,
						enq_flags);
	return true;
}
/*
* v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move
* scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22.
* The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on
* kernel side. The entire suffix can be dropped later.
*
* v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on
* dispatch renaming above for more details.
*/
bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
static inline bool
scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags)
{
	/* v6.19+: the v2 kfunc reports whether the insertion took place. */
	if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat))
		return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice,
							enq_flags);

	/*
	 * Older kernels only provide void variants; assume success. Prefer the
	 * v6.13+ name, fall back to the pre-rename scx_bpf_dispatch().
	 */
	if (bpf_ksym_exists(scx_bpf_dsq_insert___v1))
		scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags);
	else
		scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags);
	return true;
}
/*
 * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added for
* sub-sched authority checks. Drop the wrappers and move the decls to
* common.bpf.h after v6.22.
*/
bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak;
bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak;
static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
{
	/* Pre-v6.19 kernels lack the kfunc; write the field directly. */
	if (!bpf_ksym_exists(scx_bpf_task_set_slice___new)) {
		p->scx.slice = slice;
		return;
	}
	scx_bpf_task_set_slice___new(p, slice);
}
static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
{
	/* Pre-v6.19 kernels lack the kfunc; write the field directly. */
	if (!bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new)) {
		p->scx.dsq_vtime = vtime;
		return;
	}
	scx_bpf_task_set_dsq_vtime___new(p, vtime);
}
/*
* v6.19: The new void variant can be called from anywhere while the older v1
 * variant can only be called from ops.cpu_release(). The double ___ suffixes on
 * the v2 variant need to be removed once libbpf is updated to ignore ___ suffix
 * on kernel side. Drop the wrapper and move the decl to common.bpf.h after
* v6.22.
*/
u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
/*
 * True iff the running kernel provides the v2 kfunc, which may be called from
 * any context rather than only from ops.cpu_release().
 */
static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
{
	return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
}
static inline void scx_bpf_reenqueue_local(void)
{
	/*
	 * Prefer the v2 kfunc which is callable from anywhere; otherwise fall
	 * back to v1 (ops.cpu_release() only). v1's count return is discarded.
	 */
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) {
		scx_bpf_reenqueue_local___v1();
		return;
	}
	scx_bpf_reenqueue_local___v2___compat();
}
/* /*
* Define sched_ext_ops. This may be expanded to define multiple variants for * Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

View File

@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void)
* *
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
* the current minimum required kernel version. * the current minimum required kernel version.
*
* COMPAT:
* - v6.17: ops.cgroup_set_bandwidth()
* - v6.19: ops.cgroup_set_idle()
*/ */
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \ struct __scx_name *__skel; \
@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void)
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
SCX_ENUM_INIT(__skel); \ SCX_ENUM_INIT(__skel); \
if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \
!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \
__skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \
} \
if (__skel->struct_ops.__ops_name->cgroup_set_idle && \
!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \
fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
__skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \
} \
__skel; \ __skel; \
}) })

View File

@ -0,0 +1,88 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* A CPU0 scheduler.
*
* This scheduler queues all tasks to a shared DSQ and only dispatches them on
* CPU0 in FIFO order. This is useful for testing bypass behavior when many
* tasks are concentrated on a single CPU. If the load balancer doesn't work,
* bypass mode can trigger task hangs or RCU stalls as the queue is long and
* there's only one CPU working on it.
*
* - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
* - Termination notification for userspace.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#include <scx/common.bpf.h>
char _license[] SEC("license") = "GPL";
const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
UEI_DEFINE(uei);
/*
* We create a custom DSQ with ID 0 that we dispatch to and consume from on
* CPU0.
*/
#define DSQ_CPU0 0
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u64));
__uint(max_entries, 2); /* [local, cpu0] */
} stats SEC(".maps");
/* Bump this CPU's counter at @idx in the per-CPU stats map; 0=local, 1=cpu0. */
static void stat_inc(u32 idx)
{
	u64 *cnt = bpf_map_lookup_elem(&stats, &idx);

	/* Lookup fails for out-of-range indices; silently ignore. */
	if (cnt)
		*cnt += 1;
}
/*
 * Always steer wakeups to CPU0. enqueue() handles tasks that end up unable to
 * run there (see cpu0_enqueue()).
 */
s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	return 0;	/* CPU0 */
}
void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * select_cpu() always picks CPU0. If @p is not on CPU0, it can't run
	 * on CPU0. Queue on whichever CPU it's currently on.
	 */
	if (scx_bpf_task_cpu(p) != 0) {
		stat_inc(0); /* count local queueing */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return;
	}

	stat_inc(1); /* count cpu0 queueing */
	scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
}
/* Only CPU0 drains the shared DSQ; other CPUs run just their local DSQs. */
void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
{
	if (cpu == 0)
		scx_bpf_dsq_move_to_local(DSQ_CPU0);
}
/* Create the shared CPU0 DSQ (-1: no specific NUMA node). */
s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
{
	return scx_bpf_create_dsq(DSQ_CPU0, -1);
}
/* Record exit info so userspace can report it and decide whether to restart. */
void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}
/* struct_ops definition wiring the cpu0 scheduler callbacks together. */
SCX_OPS_DEFINE(cpu0_ops,
	       .select_cpu = (void *)cpu0_select_cpu,
	       .enqueue = (void *)cpu0_enqueue,
	       .dispatch = (void *)cpu0_dispatch,
	       .init = (void *)cpu0_init,
	       .exit = (void *)cpu0_exit,
	       .name = "cpu0");

106
tools/sched_ext/scx_cpu0.c Normal file
View File

@ -0,0 +1,106 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
*/
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <assert.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
#include "scx_cpu0.bpf.skel.h"
const char help_fmt[] =
"A cpu0 sched_ext scheduler.\n"
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-v]\n"
"\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
static bool verbose;
static volatile int exit_req;
/* libbpf log callback: forward everything except debug chatter unless -v. */
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	if (verbose || level != LIBBPF_DEBUG)
		return vfprintf(stderr, format, args);
	return 0;
}
/* SIGINT/SIGTERM handler: ask the main loop to exit cleanly. */
static void sigint_handler(int sig)
{
	exit_req = 1;
}
/*
 * Sum the per-CPU [local, cpu0] counters from the BPF stats map into
 * stats[0] and stats[1].
 */
static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
{
	int nr_cpus = libbpf_num_possible_cpus();

	assert(nr_cpus > 0);
	__u64 per_cpu[2][nr_cpus];	/* VLA sized by possible CPUs */
	__u32 idx;

	memset(stats, 0, sizeof(stats[0]) * 2);

	for (idx = 0; idx < 2; idx++) {
		int cpu;

		/* On lookup failure the total for this index stays 0. */
		if (bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
					&idx, per_cpu[idx]) < 0)
			continue;
		for (cpu = 0; cpu < nr_cpus; cpu++)
			stats[idx] += per_cpu[idx][cpu];
	}
}
int main(int argc, char **argv)
{
	struct scx_cpu0 *skel;
	struct bpf_link *link;
	__u32 opt;	/* getopt() returns int; -1 compares equal after conversion */
	__u64 ecode;

	libbpf_set_print(libbpf_print_fn);
	signal(SIGINT, sigint_handler);
	signal(SIGTERM, sigint_handler);
restart:
	/* Open the skeleton; re-entered when the scheduler requests restart. */
	skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
	skel->rodata->nr_cpus = libbpf_num_possible_cpus();

	/*
	 * Note: -v is parsed after SCX_OPS_OPEN(), so libbpf debug output
	 * emitted during open is still suppressed. On restart the options were
	 * already consumed, so the loop exits immediately.
	 */
	while ((opt = getopt(argc, argv, "vh")) != -1) {
		switch (opt) {
		case 'v':
			verbose = true;
			break;
		default:
			fprintf(stderr, help_fmt, basename(argv[0]));
			return opt != 'h';
		}
	}

	SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
	link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);

	/* Print queueing stats once a second until signaled or BPF exits. */
	while (!exit_req && !UEI_EXITED(skel, uei)) {
		__u64 stats[2];

		read_stats(skel, stats);
		printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
		fflush(stdout);
		sleep(1);
	}

	bpf_link__destroy(link);
	ecode = UEI_REPORT(skel, uei);
	scx_cpu0__destroy(skel);

	/* The BPF side can request a full reload via its exit code. */
	if (UEI_ECODE_RESTART(ecode))
		goto restart;
	return 0;
}

View File

@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
return; return;
} }
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp); cgc = find_cgrp_ctx(cgrp);
if (!cgc) if (!cgc)
goto out_release; goto out_release;
@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
{ {
struct cgroup *cgrp; struct cgroup *cgrp;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, true); update_active_weight_sums(cgrp, true);
bpf_cgroup_release(cgrp); bpf_cgroup_release(cgrp);
} }
@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
if (fifo_sched) if (fifo_sched)
return; return;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp); cgc = find_cgrp_ctx(cgrp);
if (cgc) { if (cgc) {
/* /*
@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
if (!taskc->bypassed_at) if (!taskc->bypassed_at)
return; return;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
cgc = find_cgrp_ctx(cgrp); cgc = find_cgrp_ctx(cgrp);
if (cgc) { if (cgc) {
__sync_fetch_and_add(&cgc->cvtime_delta, __sync_fetch_and_add(&cgc->cvtime_delta,
@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
{ {
struct cgroup *cgrp; struct cgroup *cgrp;
cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgrp = scx_bpf_task_cgroup(p);
update_active_weight_sums(cgrp, false); update_active_weight_sums(cgrp, false);
bpf_cgroup_release(cgrp); bpf_cgroup_release(cgrp);
} }

View File

@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
void *ring; void *ring;
s32 cpu; s32 cpu;
if (enq_flags & SCX_ENQ_REENQ)
__sync_fetch_and_add(&nr_reenqueued, 1);
if (p->flags & PF_KTHREAD) { if (p->flags & PF_KTHREAD) {
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
return; return;
@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
if (tctx->highpri) { if (tctx->highpri) {
/* exercise the set_*() and vtime interface too */ /* exercise the set_*() and vtime interface too */
__COMPAT_scx_bpf_dsq_move_set_slice( scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
BPF_FOR_EACH_ITER, slice_ns * 2); scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
__COMPAT_scx_bpf_dsq_move_set_vtime( scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
BPF_FOR_EACH_ITER, highpri_seq++);
__COMPAT_scx_bpf_dsq_move_vtime(
BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
} }
} }
@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
else else
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
SCX_DSQ_LOCAL_ON | cpu, SCX_ENQ_PREEMPT)) {
SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) { if (cpu == this_cpu) {
dispatched = true; dispatched = true;
__sync_fetch_and_add(&nr_expedited_local, 1); __sync_fetch_and_add(&nr_expedited_local, 1);
@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
return task_qdist(a) > task_qdist(b); return task_qdist(a) > task_qdist(b);
} }
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) SEC("tp_btf/sched_switch")
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
struct task_struct *next, unsigned long prev_state)
{ {
u32 cnt; if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
return 0;
/* /*
* Called when @cpu is taken by a higher priority scheduling class. This * If @cpu is taken by a higher priority scheduling class, it is no
* makes @cpu no longer available for executing sched_ext tasks. As we * longer available for executing sched_ext tasks. As we don't want the
* don't want the tasks in @cpu's local dsq to sit there until @cpu * tasks in @cpu's local dsq to sit there until @cpu becomes available
* becomes available again, re-enqueue them into the global dsq. See * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
* %SCX_ENQ_REENQ handling in qmap_enqueue(). * handling in qmap_enqueue().
*/ */
cnt = scx_bpf_reenqueue_local(); switch (next->policy) {
if (cnt) case 1: /* SCHED_FIFO */
__sync_fetch_and_add(&nr_reenqueued, cnt); case 2: /* SCHED_RR */
case 6: /* SCHED_DEADLINE */
scx_bpf_reenqueue_local();
}
return 0;
}
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
{
/* see qmap_sched_switch() to learn how to do this on newer kernels */
if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
scx_bpf_reenqueue_local();
} }
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,

View File

@ -174,6 +174,7 @@ auto-test-targets := \
minimal \ minimal \
numa \ numa \
allowed_cpus \ allowed_cpus \
peek_dsq \
prog_run \ prog_run \
reload_loop \ reload_loop \
select_cpu_dfl \ select_cpu_dfl \

View File

@ -0,0 +1,251 @@
// SPDX-License-Identifier: GPL-2.0
/*
* A BPF program for testing DSQ operations and peek in particular.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
*/
#include <scx/common.bpf.h>
#include <scx/compat.bpf.h>
char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei); /* Error handling */
#define MAX_SAMPLES 100
#define MAX_CPUS 512
#define DSQ_POOL_SIZE 8
int max_samples = MAX_SAMPLES;
int max_cpus = MAX_CPUS;
int dsq_pool_size = DSQ_POOL_SIZE;
/* Global variables to store test results */
int dsq_peek_result1 = -1;
long dsq_inserted_pid = -1;
int insert_test_cpu = -1; /* Set to the cpu that performs the test */
long dsq_peek_result2 = -1;
long dsq_peek_result2_pid = -1;
long dsq_peek_result2_expected = -1;
int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */
int real_dsq_id = 1235; /* DSQ for normal operation */
int enqueue_count = -1;
int dispatch_count = -1;
bool debug_ksym_exists;
/* DSQ pool for stress testing */
int dsq_pool_base_id = 2000;
int phase1_complete = -1;
long total_peek_attempts = -1;
long successful_peeks = -1;
/* BPF map for sharing peek results with userspace */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, MAX_SAMPLES);
__type(key, u32);
__type(value, long);
} peek_results SEC(".maps");
/* Pick a pseudo-random DSQ from the pool, spreading enqueues by time. */
static int get_random_dsq_id(void)
{
	return dsq_pool_base_id + (int)(bpf_ktime_get_ns() % DSQ_POOL_SIZE);
}
/*
 * Record @pid into the peek_results array map using linear probing: try up to
 * ten slots starting at pid %% MAX_SAMPLES and claim the first empty (-1)
 * slot, or reuse a slot that already holds @pid. Probing is bounded so the
 * BPF verifier can prove termination.
 */
static void record_peek_result(long pid)
{
	u32 slot_key;
	long *slot_pid_ptr;
	int ix;

	/* 0 and negative pids are sentinels, not real observations. */
	if (pid <= 0)
		return;

	/* Find an empty slot or one with the same PID */
	bpf_for(ix, 0, 10) {
		slot_key = (pid + ix) % MAX_SAMPLES;
		slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
		if (!slot_pid_ptr)
			continue;
		if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
			*slot_pid_ptr = pid;
			break;
		}
	}
}
/* Scan all DSQs in the pool and try to move a task to local */
static int scan_dsq_pool(void)
{
	struct task_struct *task;
	int moved = 0;
	int i;

	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		total_peek_attempts++;
		task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
		if (task) {
			successful_peeks++;
			record_peek_result(task->pid);
			/*
			 * Try to move this task to local.
			 *
			 * NOTE(review): on current kernels
			 * scx_bpf_dsq_move_to_local() returns bool (true when
			 * a task was moved), so the "== 0" comparison below
			 * looks inverted — confirm the intended semantics.
			 */
			if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) {
				moved = 1;
				break;
			}
		}
	}
	return moved;
}
/* Struct_ops scheduler for testing DSQ peek operations */
/* Enqueue path: runs the one-shot phase-1 insert/peek test, then routes tasks. */
void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
{
	struct task_struct *peek_result;
	int last_insert_test_cpu, cpu;

	enqueue_count++;
	cpu = bpf_get_smp_processor_id();
	/*
	 * One-shot claim: only the CPU that swaps insert_test_cpu from -1 to
	 * itself performs the phase-1 test below.
	 */
	last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);

	/* Phase 1: Simple insert-then-peek test (only on first task) */
	if (last_insert_test_cpu == -1) {
		bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);

		/* Test 1: Peek empty DSQ - should return NULL */
		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
		/*
		 * NOTE(review): dsq_peek_result1 is declared int, so the
		 * pointer is truncated on 64-bit; only the ==0 check is used,
		 * but a long would be safer.
		 */
		dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */

		/* Test 2: Insert task into test DSQ for testing in dispatch callback */
		dsq_inserted_pid = p->pid;
		scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
		dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */
	} else if (!phase1_complete) {
		/* Still in phase 1, use real DSQ */
		scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
	} else {
		/* Phase 2: Random DSQ insertion for stress testing */
		int random_dsq_id = get_random_dsq_id();

		scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
	}
}
/*
 * Dispatch path: the CPU that ran the phase-1 insert finishes the peek test
 * here; everyone else consumes from the real DSQ (phase 1) or the pool
 * (phase 2).
 */
void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
{
	dispatch_count++;

	/* Phase 1: Complete the simple peek test if we inserted a task but
	 * haven't tested peek yet
	 */
	if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
		struct task_struct *peek_result;

		bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);

		/* Test 3: Peek DSQ after insert - should return the task we inserted */
		peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);

		/* Store the PID of the peeked task for comparison */
		dsq_peek_result2 = (long)peek_result;
		dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;

		/* Now consume the task since we've peeked at it */
		scx_bpf_dsq_move_to_local(test_dsq_id);

		/* Mark phase 1 as complete */
		phase1_complete = 1;
		bpf_printk("Phase 1 complete, starting phase 2 stress testing");
	} else if (!phase1_complete) {
		/* Still in phase 1, use real DSQ */
		scx_bpf_dsq_move_to_local(real_dsq_id);
	} else {
		/* Phase 2: Scan all DSQs in the pool and try to move a task */
		if (!scan_dsq_pool()) {
			/* No tasks found in DSQ pool, fall back to real DSQ */
			scx_bpf_dsq_move_to_local(real_dsq_id);
		}
	}
}
/*
 * Create all DSQs and reset test state. Returns 0 on success or the first
 * scx_bpf_create_dsq() error.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
{
	s32 err;
	int i;

	/* Record which peek implementation (native kfunc vs fallback) is in use */
	debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;

	/* Initialize state first so a reload starts from a clean slate */
	insert_test_cpu = -1;
	enqueue_count = 0;
	dispatch_count = 0;
	phase1_complete = 0;
	total_peek_attempts = 0;
	successful_peeks = 0;

	/* Create the test and real DSQs */
	err = scx_bpf_create_dsq(test_dsq_id, -1);
	if (err) {
		scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
		return err;
	}
	err = scx_bpf_create_dsq(real_dsq_id, -1);
	if (err) {
		/* Fix: report real_dsq_id here (was copy-pasted test_dsq_id) */
		scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
		return err;
	}

	/* Create the DSQ pool for stress testing */
	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		err = scx_bpf_create_dsq(dsq_id, -1);
		if (err) {
			scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
			return err;
		}
	}

	/* Initialize the peek results map: -1 marks an empty slot */
	bpf_for(i, 0, MAX_SAMPLES) {
		u32 key = i;
		long pid = -1;

		bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
	}
	return 0;
}
/* Tear down every DSQ created in init and record exit info for userspace. */
void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
{
	int i;

	/* Destroy the primary DSQs */
	scx_bpf_destroy_dsq(test_dsq_id);
	scx_bpf_destroy_dsq(real_dsq_id);

	/* Destroy the DSQ pool */
	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		scx_bpf_destroy_dsq(dsq_id);
	}
	UEI_RECORD(uei, ei);
}
/* struct_ops registration for the peek test scheduler, attached via link. */
SEC(".struct_ops.link")
struct sched_ext_ops peek_dsq_ops = {
	.enqueue = (void *)peek_dsq_enqueue,
	.dispatch = (void *)peek_dsq_dispatch,
	.init = (void *)peek_dsq_init,
	.exit = (void *)peek_dsq_exit,
	.name = "peek_dsq",
};

View File

@ -0,0 +1,224 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Test for DSQ operations including create, destroy, and peek operations.
*
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
*/
#include <bpf/bpf.h>
#include <scx/common.h>
#include <sys/wait.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include "peek_dsq.bpf.skel.h"
#include "scx_test.h"
#define NUM_WORKERS 4
static bool workload_running = true;
static pthread_t workload_threads[NUM_WORKERS];
/**
* Background workload thread that sleeps and wakes rapidly to exercise
* the scheduler's enqueue operations and ensure DSQ operations get tested.
*/
static void *workload_thread_fn(void *arg)
{
	/*
	 * NOTE(review): workload_running is a plain bool written by the main
	 * thread and read here without atomics — consider atomic_bool; for a
	 * test that only needs eventual termination this is tolerable.
	 */
	while (workload_running) {
		/* Sleep for a very short time to trigger scheduler activity */
		usleep(1000); /* 1ms sleep */

		/* Yield to ensure we go through the scheduler */
		sched_yield();
	}
	return NULL;
}
/* Open and load the BPF skeleton; attachment happens later in run(). */
static enum scx_test_status setup(void **ctx)
{
	struct peek_dsq *skel;

	skel = peek_dsq__open();
	SCX_FAIL_IF(!skel, "Failed to open");
	SCX_ENUM_INIT(skel);
	SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
	*ctx = skel;	/* ownership passes to run()/cleanup() */

	return SCX_TEST_PASS;
}
/*
 * Dump the contents of the peek-results map and return how many valid pids
 * (positive values) were observed. A value of 0 means a recorded NULL peek,
 * -1 an untouched slot.
 */
static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
{
	long nr_seen = 0;

	printf("Observed %s DSQ peek pids:\n", dsq_name);
	for (int i = 0; i < max_samples; i++) {
		long pid;
		int err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);

		if (err) {
			printf(" Sample %d: error reading pid (err=%d)\n", i, err);
			continue;
		}
		if (pid == 0) {
			printf(" Sample %d: NULL peek\n", i);
		} else if (pid > 0) {
			printf(" Sample %d: pid %ld\n", i, pid);
			nr_seen++;
		}
	}
	printf("Observed ~%ld pids in the %s DSQ(s)\n", nr_seen, dsq_name);
	return nr_seen;
}
/*
 * Drive the test: attach the scheduler, generate wakeup-heavy load via
 * worker threads, then verify the insert/peek results the BPF side recorded.
 */
static enum scx_test_status run(void *ctx)
{
	struct peek_dsq *skel = ctx;
	bool failed = false;
	int seconds = 3;
	int err;

	/* Enable the scheduler to test DSQ operations */
	printf("Enabling scheduler to test DSQ insert operations...\n");
	struct bpf_link *link =
		bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
	if (!link) {
		SCX_ERR("Failed to attach struct_ops");
		return SCX_TEST_FAIL;
	}

	printf("Starting %d background workload threads...\n", NUM_WORKERS);
	workload_running = true;
	for (int i = 0; i < NUM_WORKERS; i++) {
		err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
		if (err) {
			SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
			/* Stop already created threads */
			workload_running = false;
			for (int j = 0; j < i; j++)
				pthread_join(workload_threads[j], NULL);
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}

	printf("Waiting for enqueue events.\n");
	sleep(seconds);
	/* Poll until the BPF side has seen at least one enqueue, capped at ~30s. */
	while (skel->data->enqueue_count <= 0) {
		printf(".");
		fflush(stdout);
		sleep(1);
		seconds++;
		if (seconds >= 30) {
			printf("\n\u2717 Timeout waiting for enqueue events\n");
			/* Stop workload threads and cleanup */
			workload_running = false;
			for (int i = 0; i < NUM_WORKERS; i++)
				pthread_join(workload_threads[i], NULL);
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}

	/* Stop and reap the load generators before checking results. */
	workload_running = false;
	for (int i = 0; i < NUM_WORKERS; i++) {
		err = pthread_join(workload_threads[i], NULL);
		if (err) {
			SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}
	printf("Background workload threads stopped.\n");

	/* The scheduler must not have errored out while it was attached. */
	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));

	/* Detach the scheduler */
	bpf_link__destroy(link);

	printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
	       skel->data->enqueue_count, skel->data->dispatch_count);
	printf("Debug: ksym_exists=%d\n",
	       skel->bss->debug_ksym_exists);

	/* Check DSQ insert result */
	printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
	if (skel->data->insert_test_cpu != -1)
		printf("\u2713 DSQ insert succeeded !\n");
	else {
		printf("\u2717 DSQ insert failed or not attempted\n");
		failed = true;
	}

	/* Check DSQ peek results: peek of an empty DSQ must have been NULL */
	printf(" DSQ peek result 1 (before insert): %d\n",
	       skel->data->dsq_peek_result1);
	if (skel->data->dsq_peek_result1 == 0)
		printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
	else {
		printf("\u2717 DSQ peek verification failed\n");
		failed = true;
	}

	/* Peek after insert must have returned the task that was inserted */
	printf(" DSQ peek result 2 (after insert): %ld\n",
	       skel->data->dsq_peek_result2);
	printf(" DSQ peek result 2, expected: %ld\n",
	       skel->data->dsq_peek_result2_expected);
	if (skel->data->dsq_peek_result2 ==
	    skel->data->dsq_peek_result2_expected)
		printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
	else {
		printf("\u2717 DSQ peek verification failed\n");
		failed = true;
	}
	printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
	printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);

	/* Phase-2 stress results: pool peeks only count in native-kfunc mode */
	int pid_count;

	pid_count = print_observed_pids(skel->maps.peek_results,
					skel->data->max_samples, "DSQ pool");
	printf("Total non-null peek observations: %ld out of %ld\n",
	       skel->data->successful_peeks, skel->data->total_peek_attempts);
	if (skel->bss->debug_ksym_exists && pid_count == 0) {
		printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
		failed = true;
	}
	if (skel->bss->debug_ksym_exists && pid_count > 0)
		printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");

	if (failed)
		return SCX_TEST_FAIL;
	else
		return SCX_TEST_PASS;
}
/* Release test resources; also reaps workload threads if run() bailed early. */
static void cleanup(void *ctx)
{
	struct peek_dsq *skel = ctx;

	if (workload_running) {
		workload_running = false;
		for (int i = 0; i < NUM_WORKERS; i++)
			pthread_join(workload_threads[i], NULL);
	}
	peek_dsq__destroy(skel);
}
/* Test registration consumed by the scx_test runner. */
struct scx_test peek_dsq = {
	.name = "peek_dsq",
	.description =
		"Test DSQ create/destroy operations and future peek functionality",
	.setup = setup,
	.run = run,
	.cleanup = cleanup,
};
REGISTER_SCX_TEST(&peek_dsq)