sched_ext: Changes for v6.19
Pull sched_ext updates from Tejun Heo:

 - Improve recovery from misbehaving BPF schedulers. When a scheduler
   puts many tasks with varying affinity restrictions on a shared DSQ,
   CPUs scanning through tasks they cannot run can overwhelm the
   system, causing lockups. Bypass mode now uses per-CPU DSQs with a
   load balancer to avoid this, and hooks into the hardlockup detector
   to attempt recovery. Add the scx_cpu0 example scheduler to
   demonstrate this scenario.

 - Add a lockless peek operation for DSQs to reduce lock contention for
   schedulers that need to query queue state during load balancing.

 - Allow scx_bpf_reenqueue_local() to be called from anywhere in
   preparation for deprecating the cpu_acquire/release() callbacks in
   favor of generic BPF hooks.

 - Prepare for hierarchical scheduler support: add
   scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
   make scx_bpf_dsq_insert*() return bool, and wrap kfunc arguments in
   structs for the future aux__prog parameter.

 - Implement the cgroup_set_idle() callback to notify BPF schedulers
   when a cgroup's idle state changes.

 - Fix migration tasks being incorrectly downgraded from
   stop_sched_class to rt_sched_class across sched_ext enable/disable.
   Applied late as the fix is low risk and the bug subtle but needs
   stable backporting.

 - Various fixes and cleanups including cgroup exit ordering,
   SCX_KICK_WAIT reliability, and backward compatibility improvements.
* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
  sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
  sched_ext: tools: Removing duplicate targets during non-cross compilation
  sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
  sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
  sched_ext: Update comments replacing breather with aborting mechanism
  sched_ext: Implement load balancer for bypass mode
  sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
  sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
  sched_ext: Add scx_cpu0 example scheduler
  sched_ext: Hook up hardlockup detector
  sched_ext: Make handle_lockup() propagate scx_verror() result
  sched_ext: Refactor lockup handlers into handle_lockup()
  sched_ext: Make scx_exit() and scx_vexit() return bool
  sched_ext: Exit dispatch and move operations immediately when aborting
  sched_ext: Simplify breather mechanism with scx_aborting flag
  sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
  sched_ext: Refactor do_enqueue_task() local and global DSQ paths
  sched_ext: Use shorter slice in bypass mode
  sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
  sched_ext: Minor cleanups to scx_task_iter
  ...
commit 02baaa67d9
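As context for the lockless-peek bullet above, here is a minimal, hypothetical sketch of how a BPF scheduler might use the new kfunc during a load-balancing pass; should_steal_from() is a made-up helper, while scx_bpf_dsq_peek() and scx_bpf_dsq_nr_queued() are the kfuncs declared in the tools/sched_ext headers further down in this diff.

/* Hypothetical helper, not part of this merge. */
static bool should_steal_from(u64 dsq_id)
{
	struct task_struct *p;

	/* Peek at the head of the DSQ without taking its lock. */
	p = scx_bpf_dsq_peek(dsq_id);
	if (!p)
		return false;	/* empty, nothing to steal */

	/* Only bother when the queue is actually backed up. */
	return scx_bpf_dsq_nr_queued(dsq_id) > 1;
}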
@@ -17,7 +17,18 @@
enum scx_public_consts {
	SCX_OPS_NAME_LEN	= 128,

	/*
	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
	 * to set the slice for a task that is selected for execution.
	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
	 * refill has been triggered.
	 *
	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
	 * mode. As making forward progress for all tasks is the main goal of
	 * the bypass mode, a shorter slice is used.
	 */
	SCX_SLICE_DFL		= 20 * 1000000,	/* 20ms */
	SCX_SLICE_BYPASS	= 5 * 1000000,	/* 5ms */
	SCX_SLICE_INF		= U64_MAX,	/* infinite, implies nohz */
};
@@ -46,6 +57,7 @@ enum scx_dsq_id_flags {
	SCX_DSQ_INVALID		= SCX_DSQ_FLAG_BUILTIN | 0,
	SCX_DSQ_GLOBAL		= SCX_DSQ_FLAG_BUILTIN | 1,
	SCX_DSQ_LOCAL		= SCX_DSQ_FLAG_BUILTIN | 2,
	SCX_DSQ_BYPASS		= SCX_DSQ_FLAG_BUILTIN | 3,
	SCX_DSQ_LOCAL_ON	= SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
};
@@ -58,6 +70,7 @@ enum scx_dsq_id_flags {
 */
struct scx_dispatch_q {
	raw_spinlock_t		lock;
	struct task_struct __rcu *first_task;	/* lockless peek at head */
	struct list_head	list;	/* tasks in dispatch order */
	struct rb_root		priq;	/* used to order by p->scx.dsq_vtime */
	u32			nr;
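A sketch of the idea behind the new first_task field, assuming the usual RCU publish/read pattern; this is illustrative only, the in-tree scx_bpf_dsq_peek() implementation lives in kernel/sched/ext.c, whose diff is suppressed further down.

/* Illustrative only: peek the head of a DSQ without taking dsq->lock. */
static struct task_struct *dsq_peek_sketch(struct scx_dispatch_q *dsq)
{
	/*
	 * Writers keep first_task in sync with the head of dsq->list while
	 * holding dsq->lock; readers only need an RCU read-side section.
	 */
	return rcu_dereference(dsq->first_task);
}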
@@ -136,6 +149,13 @@ struct scx_dsq_list_node {
	u32			priv;		/* can be used by iter cursor */
};

#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)			\
	(struct scx_dsq_list_node) {					\
		.node = LIST_HEAD_INIT((__node).node),			\
		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),		\
		.priv = (__priv),					\
	}

/*
 * The following is embedded in task_struct and contains all fields necessary
 * for a task to be scheduled by SCX.
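For reference, a hedged usage sketch of the new INIT_DSQ_LIST_CURSOR() helper; the surrounding function and its parameters are made up for illustration.

/* Hypothetical caller: reset an iterator cursor before walking a DSQ. */
static void reset_iter_cursor(struct scx_dsq_list_node *cursor, u32 flags, u32 priv)
{
	*cursor = INIT_DSQ_LIST_CURSOR(*cursor, flags, priv);
}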
@@ -207,16 +227,18 @@ struct sched_ext_entity {
	struct list_head	tasks_node;
};

void sched_ext_free(struct task_struct *p);
void sched_ext_dead(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
bool scx_hardlockup(int cpu);
bool scx_rcu_cpu_stall(void);

#else	/* !CONFIG_SCHED_CLASS_EXT */

static inline void sched_ext_free(struct task_struct *p) {}
static inline void sched_ext_dead(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
static inline bool scx_hardlockup(int cpu) { return false; }
static inline bool scx_rcu_cpu_stall(void) { return false; }

#endif	/* CONFIG_SCHED_CLASS_EXT */
@@ -228,6 +250,7 @@ struct scx_task_group {
	u64			bw_period_us;
	u64			bw_quota_us;
	u64			bw_burst_us;
	bool			idle;
#endif
};
@@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event,
	)
);

TRACE_EVENT(sched_ext_bypass_lb,

	TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced,
		 __u32 before_min, __u32 before_max,
		 __u32 after_min, __u32 after_max),

	TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced,
		before_min, before_max, after_min, after_max),

	TP_STRUCT__entry(
		__field(	__u32,	node		)
		__field(	__u32,	nr_cpus		)
		__field(	__u32,	nr_tasks	)
		__field(	__u32,	nr_balanced	)
		__field(	__u32,	before_min	)
		__field(	__u32,	before_max	)
		__field(	__u32,	after_min	)
		__field(	__u32,	after_max	)
	),

	TP_fast_assign(
		__entry->node		= node;
		__entry->nr_cpus	= nr_cpus;
		__entry->nr_tasks	= nr_tasks;
		__entry->nr_balanced	= nr_balanced;
		__entry->before_min	= before_min;
		__entry->before_max	= before_max;
		__entry->after_min	= after_min;
		__entry->after_max	= after_max;
	),

	TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u",
		  __entry->node, __entry->nr_cpus,
		  __entry->nr_tasks, __entry->nr_balanced,
		  __entry->before_min, __entry->after_min,
		  __entry->before_max, __entry->after_max
	)
);

#endif /* _TRACE_SCHED_EXT_H */

/* This part must be outside protection */
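To illustrate how the new tracepoint might be consumed, a hypothetical tp_btf program (not part of this merge) that logs bypass load-balancer activity; the program and section name follow the usual libbpf conventions.

/* Hypothetical consumer of the sched_ext_bypass_lb tracepoint. */
SEC("tp_btf/sched_ext_bypass_lb")
int BPF_PROG(log_bypass_lb, u32 node, u32 nr_cpus, u32 nr_tasks, u32 nr_balanced,
	     u32 before_min, u32 before_max, u32 after_min, u32 after_max)
{
	/* bpf_printk() accepts at most three format arguments. */
	bpf_printk("bypass lb: node=%u balanced=%u of %u", node, nr_balanced, nr_tasks);
	return 0;
}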
@@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
	WARN_ON(tsk == current);

	unwind_task_free(tsk);
	sched_ext_free(tsk);
	io_uring_free(tsk);
	cgroup_task_free(tsk);
	task_numa_free(tsk, true);
@@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
		if (prev->sched_class->task_dead)
			prev->sched_class->task_dead(prev);

		/*
		 * sched_ext_dead() must come before cgroup_task_dead() to
		 * prevent cgroups from being removed while its member tasks are
		 * visible to SCX schedulers.
		 */
		sched_ext_dead(prev);
		cgroup_task_dead(prev);

		/* Task is done with its stack. */
kernel/sched/ext.c: 1099 changes (file diff suppressed because it is too large)
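Since the kernel/sched/ext.c diff is suppressed, here is an explicitly hypothetical sketch of the bypass-mode behavior the changelog describes: every task is queued on its own CPU's bypass DSQ with the shorter SCX_SLICE_BYPASS slice, so no CPU ends up scanning a long shared queue of tasks it cannot run. The field and lock handling below are simplified assumptions, not the in-tree code.

/*
 * Simplified sketch only; the real path also maintains first_task, DSQ
 * bookkeeping, statistics, etc.
 */
static void bypass_enqueue_sketch(struct rq *rq, struct task_struct *p)
{
	struct scx_dispatch_q *dsq = &rq->scx.bypass_dsq;	/* per-CPU DSQ */

	p->scx.slice = SCX_SLICE_BYPASS;

	raw_spin_lock(&dsq->lock);
	list_add_tail(&p->scx.dsq_list.node, &dsq->list);
	dsq->nr++;
	raw_spin_unlock(&dsq->lock);
}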
@ -995,26 +995,56 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
|
|||
return prev_cpu;
|
||||
}
|
||||
|
||||
struct scx_bpf_select_cpu_and_args {
|
||||
/* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
|
||||
s32 prev_cpu;
|
||||
u64 wake_flags;
|
||||
u64 flags;
|
||||
};
|
||||
|
||||
/**
|
||||
* scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p,
|
||||
* prioritizing those in @cpus_allowed
|
||||
* __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
|
||||
* @p: task_struct to select a CPU for
|
||||
* @prev_cpu: CPU @p was on previously
|
||||
* @wake_flags: %SCX_WAKE_* flags
|
||||
* @cpus_allowed: cpumask of allowed CPUs
|
||||
* @flags: %SCX_PICK_IDLE* flags
|
||||
* @args: struct containing the rest of the arguments
|
||||
* @args->prev_cpu: CPU @p was on previously
|
||||
* @args->wake_flags: %SCX_WAKE_* flags
|
||||
* @args->flags: %SCX_PICK_IDLE* flags
|
||||
*
|
||||
* Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
|
||||
* limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
|
||||
* as an inline wrapper in common.bpf.h.
|
||||
*
|
||||
* Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
|
||||
* context such as a BPF test_run() call, as long as built-in CPU selection
|
||||
* is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
|
||||
* is set.
|
||||
*
|
||||
* @p, @prev_cpu and @wake_flags match ops.select_cpu().
|
||||
* @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
|
||||
*
|
||||
* Returns the selected idle CPU, which will be automatically awakened upon
|
||||
* returning from ops.select_cpu() and can be used for direct dispatch, or
|
||||
* a negative value if no idle CPU is available.
|
||||
*/
|
||||
__bpf_kfunc s32
|
||||
__scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
|
||||
struct scx_bpf_select_cpu_and_args *args)
|
||||
{
|
||||
struct scx_sched *sch;
|
||||
|
||||
guard(rcu)();
|
||||
|
||||
sch = rcu_dereference(scx_root);
|
||||
if (unlikely(!sch))
|
||||
return -ENODEV;
|
||||
|
||||
return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
|
||||
cpus_allowed, args->flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* COMPAT: Will be removed in v6.22.
|
||||
*/
|
||||
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
|
||||
const struct cpumask *cpus_allowed, u64 flags)
|
||||
{
|
||||
|
|
@ -1383,6 +1413,7 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
|
|||
BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
|
||||
BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
|
||||
BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
|
||||
BTF_KFUNCS_END(scx_kfunc_ids_idle)
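To show how a scheduler calls the rewritten CPU-selection kfunc from BPF, a hedged fragment (not from this merge) that relies on the scx_bpf_select_cpu_and() inline wrapper provided by the tools/sched_ext compat header later in this diff; example_select_cpu is a made-up name.

/* Hypothetical ops.select_cpu() using the wrapped kfunc via the compat inline. */
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	s32 cpu;

	/* Prefer an idle CPU within @p's allowed set; fall back to prev_cpu. */
	cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;
	return prev_cpu;
}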
|
||||
|
|
|
|||
|
|
@@ -23,6 +23,11 @@ enum scx_consts {
	 * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
	 */
	SCX_TASK_ITER_BATCH		= 32,

	SCX_BYPASS_LB_DFL_INTV_US	= 500 * USEC_PER_MSEC,
	SCX_BYPASS_LB_DONOR_PCT		= 125,
	SCX_BYPASS_LB_MIN_DELTA_DIV	= 4,
	SCX_BYPASS_LB_BATCH		= 256,
};

enum scx_exit_kind {
@@ -697,12 +702,23 @@ struct sched_ext_ops {
 	 * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be
 	 * interpreted in the same fashion and specifies how much @cgrp can
 	 * burst temporarily. The specific control mechanism and thus the
-	 * interpretation of @period_us and burstiness is upto to the BPF
+	 * interpretation of @period_us and burstiness is up to the BPF
 	 * scheduler.
 	 */
 	void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
 				     u64 period_us, u64 quota_us, u64 burst_us);
 
+	/**
+	 * @cgroup_set_idle: A cgroup's idle state is being changed
+	 * @cgrp: cgroup whose idle state is being updated
+	 * @idle: whether the cgroup is entering or exiting idle state
+	 *
+	 * Update @cgrp's idle state to @idle. This callback is invoked when
+	 * a cgroup transitions between idle and non-idle states, allowing the
+	 * BPF scheduler to adjust its behavior accordingly.
+	 */
+	void (*cgroup_set_idle)(struct cgroup *cgrp, bool idle);
+
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
 	/*
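As a usage illustration for the new callback, a hypothetical BPF-side implementation (not from this merge); idle_cgrp_map and the example_* name are made up, and a real scheduler would also consult this map in its enqueue path.

/* Hypothetical: remember which cgroups are idle so enqueue can deprioritize them. */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, u64);	/* cgroup ID */
	__type(value, u8);	/* 1 if idle */
} idle_cgrp_map SEC(".maps");

void BPF_STRUCT_OPS(example_cgroup_set_idle, struct cgroup *cgrp, bool idle)
{
	u64 cgid = cgrp->kn->id;
	u8 val = idle;

	bpf_map_update_elem(&idle_cgrp_map, &cgid, &val, BPF_ANY);
}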
@ -884,6 +900,10 @@ struct scx_sched {
|
|||
struct scx_dispatch_q **global_dsqs;
|
||||
struct scx_sched_pcpu __percpu *pcpu;
|
||||
|
||||
/*
|
||||
* Updates to the following warned bitfields can race causing RMW issues
|
||||
* but it doesn't really matter.
|
||||
*/
|
||||
bool warned_zero_slice:1;
|
||||
bool warned_deprecated_rq:1;
|
||||
|
||||
|
|
@ -948,6 +968,7 @@ enum scx_enq_flags {
|
|||
|
||||
SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
|
||||
SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
|
||||
SCX_ENQ_NESTED = 1LLU << 58,
|
||||
};
|
||||
|
||||
enum scx_deq_flags {
|
||||
|
|
@ -986,8 +1007,10 @@ enum scx_kick_flags {
|
|||
SCX_KICK_PREEMPT = 1LLU << 1,
|
||||
|
||||
/*
|
||||
* Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
|
||||
* return after the target CPU finishes picking the next task.
|
||||
* The scx_bpf_kick_cpu() call will return after the current SCX task of
|
||||
* the target CPU switches out. This can be used to implement e.g. core
|
||||
* scheduling. This has no effect if the current task on the target CPU
|
||||
* is not on SCX.
|
||||
*/
|
||||
SCX_KICK_WAIT = 1LLU << 2,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -803,10 +803,12 @@ struct scx_rq {
|
|||
cpumask_var_t cpus_to_kick_if_idle;
|
||||
cpumask_var_t cpus_to_preempt;
|
||||
cpumask_var_t cpus_to_wait;
|
||||
unsigned long pnt_seq;
|
||||
unsigned long kick_sync;
|
||||
local_t reenq_local_deferred;
|
||||
struct balance_callback deferred_bal_cb;
|
||||
struct irq_work deferred_irq_work;
|
||||
struct irq_work kick_cpus_irq_work;
|
||||
struct scx_dispatch_q bypass_dsq;
|
||||
};
|
||||
#endif /* CONFIG_SCHED_CLASS_EXT */
|
||||
|
||||
|
|
|
|||
|
|
@@ -196,6 +196,15 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
#ifdef CONFIG_SYSFS
		++hardlockup_count;
#endif
		/*
		 * A poorly behaving BPF scheduler can trigger hard lockup by
		 * e.g. putting numerous affinitized tasks in a single queue and
		 * directing all CPUs at it. The following call can return true
		 * only once when sched_ext is enabled and will immediately
		 * abort the BPF scheduler and print out a warning message.
		 */
		if (scx_hardlockup(cpu))
			return;

		/* Only print hardlockups once. */
		if (per_cpu(watchdog_hardlockup_warned, cpu))
|
|||
|
|
@ -133,6 +133,7 @@ $(MAKE_DIRS):
|
|||
$(call msg,MKDIR,,$@)
|
||||
$(Q)mkdir -p $@
|
||||
|
||||
ifneq ($(CROSS_COMPILE),)
|
||||
$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
|
||||
$(APIDIR)/linux/bpf.h \
|
||||
| $(OBJ_DIR)/libbpf
|
||||
|
|
@ -141,6 +142,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
|
|||
EXTRA_CFLAGS='-g -O0 -fPIC' \
|
||||
LDFLAGS="$(LDFLAGS)" \
|
||||
DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
|
||||
endif
|
||||
|
||||
$(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
|
||||
$(APIDIR)/linux/bpf.h \
|
||||
|
|
@ -187,7 +189,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP
|
|||
|
||||
SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
|
||||
|
||||
c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
|
||||
c-sched-targets = scx_simple scx_cpu0 scx_qmap scx_central scx_flatcg
|
||||
|
||||
$(addprefix $(BINDIR)/,$(c-sched-targets)): \
|
||||
$(BINDIR)/%: \
|
||||
|
|
|
|||
|
|
@ -60,21 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void)
|
|||
|
||||
s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
|
||||
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
|
||||
s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
|
||||
const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
|
||||
void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
|
||||
void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
|
||||
s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
|
||||
struct scx_bpf_select_cpu_and_args *args) __ksym __weak;
|
||||
bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak;
|
||||
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
|
||||
void scx_bpf_dispatch_cancel(void) __ksym;
|
||||
bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak;
|
||||
void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
|
||||
void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
|
||||
bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
u32 scx_bpf_reenqueue_local(void) __ksym;
|
||||
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
|
||||
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
|
||||
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
|
||||
struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak;
|
||||
int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
|
||||
struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
|
||||
void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
|
||||
|
|
@ -105,7 +99,6 @@ s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
|
|||
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
|
||||
struct rq *scx_bpf_locked_rq(void) __ksym;
|
||||
struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
|
||||
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
|
||||
u64 scx_bpf_now(void) __ksym __weak;
|
||||
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
|
||||
|
||||
|
|
|
|||
|
|
@ -16,119 +16,92 @@
|
|||
})
|
||||
|
||||
/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */
|
||||
#define __COMPAT_scx_bpf_task_cgroup(p) \
|
||||
(bpf_ksym_exists(scx_bpf_task_cgroup) ? \
|
||||
scx_bpf_task_cgroup((p)) : NULL)
|
||||
struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
|
||||
|
||||
#define scx_bpf_task_cgroup(p) \
|
||||
(bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \
|
||||
scx_bpf_task_cgroup___new((p)) : NULL)
|
||||
|
||||
/*
|
||||
* v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are
|
||||
* renamed to unload the verb.
|
||||
*
|
||||
* Build error is triggered if old names are used. New binaries work with both
|
||||
* new and old names. The compat macros will be removed on v6.15 release.
|
||||
*
|
||||
* scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
|
||||
* 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
|
||||
* Preserve __COMPAT macros until v6.15.
|
||||
*/
|
||||
void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
|
||||
void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
|
||||
bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak;
|
||||
void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
|
||||
void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
|
||||
bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
|
||||
bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
|
||||
void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
|
||||
void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
|
||||
bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
|
||||
#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_insert) ? \
|
||||
scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \
|
||||
scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags)))
|
||||
|
||||
#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \
|
||||
scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \
|
||||
scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags)))
|
||||
bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak;
|
||||
void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
|
||||
void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
|
||||
bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
|
||||
|
||||
#define scx_bpf_dsq_move_to_local(dsq_id) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \
|
||||
scx_bpf_dsq_move_to_local((dsq_id)) : \
|
||||
scx_bpf_consume___compat((dsq_id)))
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \
|
||||
scx_bpf_dsq_move_to_local___new((dsq_id)) : \
|
||||
scx_bpf_consume___old((dsq_id)))
|
||||
|
||||
#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \
|
||||
scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \
|
||||
scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \
|
||||
#define scx_bpf_dsq_move_set_slice(it__iter, slice) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \
|
||||
scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \
|
||||
scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \
|
||||
(void)0))
|
||||
|
||||
#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \
|
||||
scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \
|
||||
scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \
|
||||
(void) 0))
|
||||
#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \
|
||||
scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \
|
||||
scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \
|
||||
(void)0))
|
||||
|
||||
#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move) ? \
|
||||
scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \
|
||||
scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move___new) ? \
|
||||
scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? \
|
||||
scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
false))
|
||||
|
||||
#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \
|
||||
scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \
|
||||
scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \
|
||||
(bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \
|
||||
scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
(bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \
|
||||
scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \
|
||||
false))
|
||||
|
||||
/*
|
||||
* v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits")
|
||||
*
|
||||
* Compat macro will be dropped on v6.19 release.
|
||||
*/
|
||||
int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
|
||||
|
||||
#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
|
||||
(bpf_ksym_exists(bpf_cpumask_populate) ? \
|
||||
(bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
|
||||
|
||||
#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
|
||||
_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
|
||||
/*
|
||||
* v6.19: Introduce lockless peek API for user DSQs.
|
||||
*
|
||||
* Preserve the following macro until v6.21.
|
||||
*/
|
||||
static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
|
||||
{
|
||||
struct task_struct *p = NULL;
|
||||
struct bpf_iter_scx_dsq it;
|
||||
|
||||
#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \
|
||||
_Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()")
|
||||
|
||||
#define scx_bpf_consume(dsq_id) ({ \
|
||||
_Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \
|
||||
false; \
|
||||
})
|
||||
|
||||
#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
|
||||
_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()")
|
||||
|
||||
#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
|
||||
_Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()")
|
||||
|
||||
#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
|
||||
_Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \
|
||||
false; \
|
||||
})
|
||||
|
||||
#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
|
||||
_Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \
|
||||
false; \
|
||||
})
|
||||
|
||||
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \
|
||||
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()")
|
||||
|
||||
#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \
|
||||
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()")
|
||||
|
||||
#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
|
||||
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \
|
||||
false; \
|
||||
})
|
||||
|
||||
#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \
|
||||
_Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \
|
||||
false; \
|
||||
})
|
||||
if (bpf_ksym_exists(scx_bpf_dsq_peek))
|
||||
return scx_bpf_dsq_peek(dsq_id);
|
||||
if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0))
|
||||
p = bpf_iter_scx_dsq_next(&it);
|
||||
bpf_iter_scx_dsq_destroy(&it);
|
||||
return p;
|
||||
}
|
||||
|
||||
/**
|
||||
* __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
|
||||
|
|
@ -247,6 +220,161 @@ static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
|
|||
return rq ? rq->curr : NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* v6.19: To work around BPF maximum parameter limit, the following kfuncs are
|
||||
* replaced with variants that pack scalar arguments in a struct. Wrappers are
|
||||
* provided to maintain source compatibility.
|
||||
*
|
||||
* v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the
|
||||
* block on dispatch renaming above for more details.
|
||||
*
|
||||
* The kernel will carry the compat variants until v6.23 to maintain binary
|
||||
* compatibility. After v6.23 release, remove the compat handling and move the
|
||||
* wrappers to common.bpf.h.
|
||||
*/
|
||||
s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
|
||||
const struct cpumask *cpus_allowed, u64 flags) __ksym __weak;
|
||||
void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
|
||||
void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak;
|
||||
|
||||
/**
|
||||
* scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p
|
||||
* @p: task_struct to select a CPU for
|
||||
* @prev_cpu: CPU @p was on previously
|
||||
* @wake_flags: %SCX_WAKE_* flags
|
||||
* @cpus_allowed: cpumask of allowed CPUs
|
||||
* @flags: %SCX_PICK_IDLE* flags
|
||||
*
|
||||
* Inline wrapper that packs scalar arguments into a struct and calls
|
||||
* __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details.
|
||||
*/
|
||||
static inline s32
|
||||
scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
|
||||
const struct cpumask *cpus_allowed, u64 flags)
|
||||
{
|
||||
if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) {
|
||||
struct scx_bpf_select_cpu_and_args args = {
|
||||
.prev_cpu = prev_cpu,
|
||||
.wake_flags = wake_flags,
|
||||
.flags = flags,
|
||||
};
|
||||
|
||||
return __scx_bpf_select_cpu_and(p, cpus_allowed, &args);
|
||||
} else {
|
||||
return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags,
|
||||
cpus_allowed, flags);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
|
||||
* @p: task_struct to insert
|
||||
* @dsq_id: DSQ to insert into
|
||||
* @slice: duration @p can run for in nsecs, 0 to keep the current value
|
||||
* @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
|
||||
* @enq_flags: SCX_ENQ_*
|
||||
*
|
||||
* Inline wrapper that packs scalar arguments into a struct and calls
|
||||
* __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details.
|
||||
*/
|
||||
static inline bool
|
||||
scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime,
|
||||
u64 enq_flags)
|
||||
{
|
||||
if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) {
|
||||
struct scx_bpf_dsq_insert_vtime_args args = {
|
||||
.dsq_id = dsq_id,
|
||||
.slice = slice,
|
||||
.vtime = vtime,
|
||||
.enq_flags = enq_flags,
|
||||
};
|
||||
|
||||
return __scx_bpf_dsq_insert_vtime(p, &args);
|
||||
} else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) {
|
||||
scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime,
|
||||
enq_flags);
|
||||
return true;
|
||||
} else {
|
||||
scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime,
|
||||
enq_flags);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move
|
||||
* scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22.
|
||||
* The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on
|
||||
* kernel side. The entire suffix can be dropped later.
|
||||
*
|
||||
* v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on
|
||||
* dispatch renaming above for more details.
|
||||
*/
|
||||
bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
|
||||
void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
|
||||
void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak;
|
||||
|
||||
static inline bool
|
||||
scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags)
|
||||
{
|
||||
if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) {
|
||||
return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags);
|
||||
} else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) {
|
||||
scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags);
|
||||
return true;
|
||||
} else {
|
||||
scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added for
|
||||
* sub-sched authority checks. Drop the wrappers and move the decls to
|
||||
* common.bpf.h after v6.22.
|
||||
*/
|
||||
bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak;
|
||||
bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak;
|
||||
|
||||
static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
|
||||
{
|
||||
if (bpf_ksym_exists(scx_bpf_task_set_slice___new))
|
||||
scx_bpf_task_set_slice___new(p, slice);
|
||||
else
|
||||
p->scx.slice = slice;
|
||||
}
|
||||
|
||||
static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
|
||||
{
|
||||
if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new))
|
||||
scx_bpf_task_set_dsq_vtime___new(p, vtime);
|
||||
else
|
||||
p->scx.dsq_vtime = vtime;
|
||||
}
|
||||
|
||||
/*
|
||||
* v6.19: The new void variant can be called from anywhere while the older v1
|
||||
* variant can only be called from ops.cpu_release(). The double ___ prefixes on
|
||||
* the v2 variant need to be removed once libbpf is updated to ignore ___ prefix
|
||||
* on kernel side. Drop the wrapper and move the decl to common.bpf.h after
|
||||
* v6.22.
|
||||
*/
|
||||
u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
|
||||
void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
|
||||
|
||||
static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
|
||||
{
|
||||
return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
|
||||
}
|
||||
|
||||
static inline void scx_bpf_reenqueue_local(void)
|
||||
{
|
||||
if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
|
||||
scx_bpf_reenqueue_local___v2___compat();
|
||||
else
|
||||
scx_bpf_reenqueue_local___v1();
|
||||
}
|
||||
|
||||
/*
|
||||
* Define sched_ext_ops. This may be expanded to define multiple variants for
|
||||
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
|
||||
|
|
|
|||
|
|
@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void)
|
|||
*
|
||||
* ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
|
||||
* the current minimum required kernel version.
|
||||
*
|
||||
* COMPAT:
|
||||
* - v6.17: ops.cgroup_set_bandwidth()
|
||||
* - v6.19: ops.cgroup_set_idle()
|
||||
*/
|
||||
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
|
||||
struct __scx_name *__skel; \
|
||||
|
|
@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void)
|
|||
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
|
||||
__skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
|
||||
SCX_ENUM_INIT(__skel); \
|
||||
if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \
|
||||
!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \
|
||||
fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \
|
||||
__skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \
|
||||
} \
|
||||
if (__skel->struct_ops.__ops_name->cgroup_set_idle && \
|
||||
!__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \
|
||||
fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
|
||||
__skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \
|
||||
} \
|
||||
__skel; \
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* A CPU0 scheduler.
|
||||
*
|
||||
* This scheduler queues all tasks to a shared DSQ and only dispatches them on
|
||||
* CPU0 in FIFO order. This is useful for testing bypass behavior when many
|
||||
* tasks are concentrated on a single CPU. If the load balancer doesn't work,
|
||||
* bypass mode can trigger task hangs or RCU stalls as the queue is long and
|
||||
* there's only one CPU working on it.
|
||||
*
|
||||
* - Statistics tracking how many tasks are queued to local and CPU0 DSQs.
|
||||
* - Termination notification for userspace.
|
||||
*
|
||||
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
|
||||
*/
|
||||
#include <scx/common.bpf.h>
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
|
||||
|
||||
UEI_DEFINE(uei);
|
||||
|
||||
/*
|
||||
* We create a custom DSQ with ID 0 that we dispatch to and consume from on
|
||||
* CPU0.
|
||||
*/
|
||||
#define DSQ_CPU0 0
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(u64));
|
||||
__uint(max_entries, 2); /* [local, cpu0] */
|
||||
} stats SEC(".maps");
|
||||
|
||||
static void stat_inc(u32 idx)
|
||||
{
|
||||
u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
|
||||
if (cnt_p)
|
||||
(*cnt_p)++;
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS(cpu0_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
|
||||
{
|
||||
/*
|
||||
* select_cpu() always picks CPU0. If @p is not on CPU0, it can't run on
|
||||
* CPU 0. Queue on whichever CPU it's currently on.
|
||||
*/
|
||||
if (scx_bpf_task_cpu(p) != 0) {
|
||||
stat_inc(0); /* count local queueing */
|
||||
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
stat_inc(1); /* count cpu0 queueing */
|
||||
scx_bpf_dsq_insert(p, DSQ_CPU0, SCX_SLICE_DFL, enq_flags);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
|
||||
{
|
||||
if (cpu == 0)
|
||||
scx_bpf_dsq_move_to_local(DSQ_CPU0);
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
|
||||
{
|
||||
return scx_bpf_create_dsq(DSQ_CPU0, -1);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(cpu0_exit, struct scx_exit_info *ei)
|
||||
{
|
||||
UEI_RECORD(uei, ei);
|
||||
}
|
||||
|
||||
SCX_OPS_DEFINE(cpu0_ops,
|
||||
.select_cpu = (void *)cpu0_select_cpu,
|
||||
.enqueue = (void *)cpu0_enqueue,
|
||||
.dispatch = (void *)cpu0_dispatch,
|
||||
.init = (void *)cpu0_init,
|
||||
.exit = (void *)cpu0_exit,
|
||||
.name = "cpu0");
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2025 Tejun Heo <tj@kernel.org>
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <assert.h>
|
||||
#include <libgen.h>
|
||||
#include <bpf/bpf.h>
|
||||
#include <scx/common.h>
|
||||
#include "scx_cpu0.bpf.skel.h"
|
||||
|
||||
const char help_fmt[] =
|
||||
"A cpu0 sched_ext scheduler.\n"
|
||||
"\n"
|
||||
"See the top-level comment in .bpf.c for more details.\n"
|
||||
"\n"
|
||||
"Usage: %s [-v]\n"
|
||||
"\n"
|
||||
" -v Print libbpf debug messages\n"
|
||||
" -h Display this help and exit\n";
|
||||
|
||||
static bool verbose;
|
||||
static volatile int exit_req;
|
||||
|
||||
static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
|
||||
{
|
||||
if (level == LIBBPF_DEBUG && !verbose)
|
||||
return 0;
|
||||
return vfprintf(stderr, format, args);
|
||||
}
|
||||
|
||||
static void sigint_handler(int sig)
|
||||
{
|
||||
exit_req = 1;
|
||||
}
|
||||
|
||||
static void read_stats(struct scx_cpu0 *skel, __u64 *stats)
|
||||
{
|
||||
int nr_cpus = libbpf_num_possible_cpus();
|
||||
assert(nr_cpus > 0);
|
||||
__u64 cnts[2][nr_cpus];
|
||||
__u32 idx;
|
||||
|
||||
memset(stats, 0, sizeof(stats[0]) * 2);
|
||||
|
||||
for (idx = 0; idx < 2; idx++) {
|
||||
int ret, cpu;
|
||||
|
||||
ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
|
||||
&idx, cnts[idx]);
|
||||
if (ret < 0)
|
||||
continue;
|
||||
for (cpu = 0; cpu < nr_cpus; cpu++)
|
||||
stats[idx] += cnts[idx][cpu];
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct scx_cpu0 *skel;
|
||||
struct bpf_link *link;
|
||||
__u32 opt;
|
||||
__u64 ecode;
|
||||
|
||||
libbpf_set_print(libbpf_print_fn);
|
||||
signal(SIGINT, sigint_handler);
|
||||
signal(SIGTERM, sigint_handler);
|
||||
restart:
|
||||
skel = SCX_OPS_OPEN(cpu0_ops, scx_cpu0);
|
||||
|
||||
skel->rodata->nr_cpus = libbpf_num_possible_cpus();
|
||||
|
||||
while ((opt = getopt(argc, argv, "vh")) != -1) {
|
||||
switch (opt) {
|
||||
case 'v':
|
||||
verbose = true;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, help_fmt, basename(argv[0]));
|
||||
return opt != 'h';
|
||||
}
|
||||
}
|
||||
|
||||
SCX_OPS_LOAD(skel, cpu0_ops, scx_cpu0, uei);
|
||||
link = SCX_OPS_ATTACH(skel, cpu0_ops, scx_cpu0);
|
||||
|
||||
while (!exit_req && !UEI_EXITED(skel, uei)) {
|
||||
__u64 stats[2];
|
||||
|
||||
read_stats(skel, stats);
|
||||
printf("local=%llu cpu0=%llu\n", stats[0], stats[1]);
|
||||
fflush(stdout);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
bpf_link__destroy(link);
|
||||
ecode = UEI_REPORT(skel, uei);
|
||||
scx_cpu0__destroy(skel);
|
||||
|
||||
if (UEI_ECODE_RESTART(ecode))
|
||||
goto restart;
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -382,7 +382,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
|
|||
return;
|
||||
}
|
||||
|
||||
cgrp = __COMPAT_scx_bpf_task_cgroup(p);
|
||||
cgrp = scx_bpf_task_cgroup(p);
|
||||
cgc = find_cgrp_ctx(cgrp);
|
||||
if (!cgc)
|
||||
goto out_release;
|
||||
|
|
@ -508,7 +508,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
|
|||
{
|
||||
struct cgroup *cgrp;
|
||||
|
||||
cgrp = __COMPAT_scx_bpf_task_cgroup(p);
|
||||
cgrp = scx_bpf_task_cgroup(p);
|
||||
update_active_weight_sums(cgrp, true);
|
||||
bpf_cgroup_release(cgrp);
|
||||
}
|
||||
|
|
@ -521,7 +521,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
|
|||
if (fifo_sched)
|
||||
return;
|
||||
|
||||
cgrp = __COMPAT_scx_bpf_task_cgroup(p);
|
||||
cgrp = scx_bpf_task_cgroup(p);
|
||||
cgc = find_cgrp_ctx(cgrp);
|
||||
if (cgc) {
|
||||
/*
|
||||
|
|
@ -564,7 +564,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
|
|||
if (!taskc->bypassed_at)
|
||||
return;
|
||||
|
||||
cgrp = __COMPAT_scx_bpf_task_cgroup(p);
|
||||
cgrp = scx_bpf_task_cgroup(p);
|
||||
cgc = find_cgrp_ctx(cgrp);
|
||||
if (cgc) {
|
||||
__sync_fetch_and_add(&cgc->cvtime_delta,
|
||||
|
|
@ -578,7 +578,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
|
|||
{
|
||||
struct cgroup *cgrp;
|
||||
|
||||
cgrp = __COMPAT_scx_bpf_task_cgroup(p);
|
||||
cgrp = scx_bpf_task_cgroup(p);
|
||||
update_active_weight_sums(cgrp, false);
|
||||
bpf_cgroup_release(cgrp);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
|
|||
void *ring;
|
||||
s32 cpu;
|
||||
|
||||
if (enq_flags & SCX_ENQ_REENQ)
|
||||
__sync_fetch_and_add(&nr_reenqueued, 1);
|
||||
|
||||
if (p->flags & PF_KTHREAD) {
|
||||
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
|
||||
return;
|
||||
|
|
@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
|
|||
|
||||
if (tctx->highpri) {
|
||||
/* exercise the set_*() and vtime interface too */
|
||||
__COMPAT_scx_bpf_dsq_move_set_slice(
|
||||
BPF_FOR_EACH_ITER, slice_ns * 2);
|
||||
__COMPAT_scx_bpf_dsq_move_set_vtime(
|
||||
BPF_FOR_EACH_ITER, highpri_seq++);
|
||||
__COMPAT_scx_bpf_dsq_move_vtime(
|
||||
BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
|
||||
scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
|
||||
scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
|
||||
scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
|
|||
else
|
||||
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
|
||||
|
||||
if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
|
||||
SCX_DSQ_LOCAL_ON | cpu,
|
||||
SCX_ENQ_PREEMPT)) {
|
||||
if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
|
||||
SCX_ENQ_PREEMPT)) {
|
||||
if (cpu == this_cpu) {
|
||||
dispatched = true;
|
||||
__sync_fetch_and_add(&nr_expedited_local, 1);
|
||||
|
|
@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
|
|||
return task_qdist(a) > task_qdist(b);
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
|
||||
SEC("tp_btf/sched_switch")
|
||||
int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
|
||||
struct task_struct *next, unsigned long prev_state)
|
||||
{
|
||||
u32 cnt;
|
||||
if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Called when @cpu is taken by a higher priority scheduling class. This
|
||||
* makes @cpu no longer available for executing sched_ext tasks. As we
|
||||
* don't want the tasks in @cpu's local dsq to sit there until @cpu
|
||||
* becomes available again, re-enqueue them into the global dsq. See
|
||||
* %SCX_ENQ_REENQ handling in qmap_enqueue().
|
||||
* If @cpu is taken by a higher priority scheduling class, it is no
|
||||
* longer available for executing sched_ext tasks. As we don't want the
|
||||
* tasks in @cpu's local dsq to sit there until @cpu becomes available
|
||||
* again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
|
||||
* handling in qmap_enqueue().
|
||||
*/
|
||||
cnt = scx_bpf_reenqueue_local();
|
||||
if (cnt)
|
||||
__sync_fetch_and_add(&nr_reenqueued, cnt);
|
||||
switch (next->policy) {
|
||||
case 1: /* SCHED_FIFO */
|
||||
case 2: /* SCHED_RR */
|
||||
case 6: /* SCHED_DEADLINE */
|
||||
scx_bpf_reenqueue_local();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
|
||||
{
|
||||
/* see qmap_sched_switch() to learn how to do this on newer kernels */
|
||||
if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
|
||||
scx_bpf_reenqueue_local();
|
||||
}
|
||||
|
||||
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
|
||||
|
|
|
|||
|
|
@ -174,6 +174,7 @@ auto-test-targets := \
|
|||
minimal \
|
||||
numa \
|
||||
allowed_cpus \
|
||||
peek_dsq \
|
||||
prog_run \
|
||||
reload_loop \
|
||||
select_cpu_dfl \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,251 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* A BPF program for testing DSQ operations and peek in particular.
|
||||
*
|
||||
* Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
|
||||
* Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
|
||||
*/
|
||||
|
||||
#include <scx/common.bpf.h>
|
||||
#include <scx/compat.bpf.h>
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
UEI_DEFINE(uei); /* Error handling */
|
||||
|
||||
#define MAX_SAMPLES 100
|
||||
#define MAX_CPUS 512
|
||||
#define DSQ_POOL_SIZE 8
|
||||
int max_samples = MAX_SAMPLES;
|
||||
int max_cpus = MAX_CPUS;
|
||||
int dsq_pool_size = DSQ_POOL_SIZE;
|
||||
|
||||
/* Global variables to store test results */
|
||||
int dsq_peek_result1 = -1;
|
||||
long dsq_inserted_pid = -1;
|
||||
int insert_test_cpu = -1; /* Set to the cpu that performs the test */
|
||||
long dsq_peek_result2 = -1;
|
||||
long dsq_peek_result2_pid = -1;
|
||||
long dsq_peek_result2_expected = -1;
|
||||
int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */
|
||||
int real_dsq_id = 1235; /* DSQ for normal operation */
|
||||
int enqueue_count = -1;
|
||||
int dispatch_count = -1;
|
||||
bool debug_ksym_exists;
|
||||
|
||||
/* DSQ pool for stress testing */
|
||||
int dsq_pool_base_id = 2000;
|
||||
int phase1_complete = -1;
|
||||
long total_peek_attempts = -1;
|
||||
long successful_peeks = -1;
|
||||
|
||||
/* BPF map for sharing peek results with userspace */
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__uint(max_entries, MAX_SAMPLES);
|
||||
__type(key, u32);
|
||||
__type(value, long);
|
||||
} peek_results SEC(".maps");
|
||||
|
||||
static int get_random_dsq_id(void)
|
||||
{
|
||||
u64 time = bpf_ktime_get_ns();
|
||||
|
||||
return dsq_pool_base_id + (time % DSQ_POOL_SIZE);
|
||||
}
|
||||
|
||||
static void record_peek_result(long pid)
|
||||
{
|
||||
u32 slot_key;
|
||||
long *slot_pid_ptr;
|
||||
int ix;
|
||||
|
||||
if (pid <= 0)
|
||||
return;
|
||||
|
||||
/* Find an empty slot or one with the same PID */
|
||||
bpf_for(ix, 0, 10) {
|
||||
slot_key = (pid + ix) % MAX_SAMPLES;
|
||||
slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
|
||||
if (!slot_pid_ptr)
|
||||
continue;
|
||||
|
||||
if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
|
||||
*slot_pid_ptr = pid;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Scan all DSQs in the pool and try to move a task to local */
|
||||
static int scan_dsq_pool(void)
|
||||
{
|
||||
struct task_struct *task;
|
||||
int moved = 0;
|
||||
int i;
|
||||
|
||||
bpf_for(i, 0, DSQ_POOL_SIZE) {
|
||||
int dsq_id = dsq_pool_base_id + i;
|
||||
|
||||
total_peek_attempts++;
|
||||
|
||||
task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
|
||||
if (task) {
|
||||
successful_peeks++;
|
||||
record_peek_result(task->pid);
|
||||
|
||||
/* Try to move this task to local */
|
||||
if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) {
|
||||
moved = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return moved;
|
||||
}
|
||||
|
||||
/* Struct_ops scheduler for testing DSQ peek operations */
|
||||
void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
|
||||
{
|
||||
struct task_struct *peek_result;
|
||||
int last_insert_test_cpu, cpu;
|
||||
|
||||
enqueue_count++;
|
||||
cpu = bpf_get_smp_processor_id();
|
||||
last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);
|
||||
|
||||
/* Phase 1: Simple insert-then-peek test (only on first task) */
|
||||
if (last_insert_test_cpu == -1) {
|
||||
bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);
|
||||
|
||||
/* Test 1: Peek empty DSQ - should return NULL */
|
||||
peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
|
||||
dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */
|
||||
|
||||
/* Test 2: Insert task into test DSQ for testing in dispatch callback */
|
||||
dsq_inserted_pid = p->pid;
|
||||
scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
|
||||
dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */
|
||||
} else if (!phase1_complete) {
|
||||
/* Still in phase 1, use real DSQ */
|
||||
scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
|
||||
} else {
|
||||
/* Phase 2: Random DSQ insertion for stress testing */
|
||||
int random_dsq_id = get_random_dsq_id();
|
||||
|
||||
scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
|
||||
}
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
|
||||
{
|
||||
dispatch_count++;
|
||||
|
||||
/* Phase 1: Complete the simple peek test if we inserted a task but
|
||||
* haven't tested peek yet
|
||||
*/
|
||||
if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
|
||||
struct task_struct *peek_result;
|
||||
|
||||
bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);
|
||||
|
||||
/* Test 3: Peek DSQ after insert - should return the task we inserted */
|
||||
peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
|
||||
/* Store the PID of the peeked task for comparison */
|
||||
dsq_peek_result2 = (long)peek_result;
|
||||
dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;
|
||||
|
||||
/* Now consume the task since we've peeked at it */
|
||||
scx_bpf_dsq_move_to_local(test_dsq_id);
|
||||
|
||||
/* Mark phase 1 as complete */
|
||||
phase1_complete = 1;
|
||||
bpf_printk("Phase 1 complete, starting phase 2 stress testing");
|
||||
} else if (!phase1_complete) {
|
||||
/* Still in phase 1, use real DSQ */
|
||||
scx_bpf_dsq_move_to_local(real_dsq_id);
|
||||
} else {
|
||||
/* Phase 2: Scan all DSQs in the pool and try to move a task */
|
||||
if (!scan_dsq_pool()) {
|
||||
/* No tasks found in DSQ pool, fall back to real DSQ */
|
||||
scx_bpf_dsq_move_to_local(real_dsq_id);
|
||||
}
|
||||
}
|
||||
}

s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
{
	s32 err;
	int i;

	/* Always set debug values so we can see which version we're using */
	debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;

	/* Initialize state first */
	insert_test_cpu = -1;
	enqueue_count = 0;
	dispatch_count = 0;
	phase1_complete = 0;
	total_peek_attempts = 0;
	successful_peeks = 0;

	/* Create the test and real DSQs */
	err = scx_bpf_create_dsq(test_dsq_id, -1);
	if (err) {
		scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
		return err;
	}
	err = scx_bpf_create_dsq(real_dsq_id, -1);
	if (err) {
		scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
		return err;
	}

	/* Create the DSQ pool for stress testing */
	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		err = scx_bpf_create_dsq(dsq_id, -1);
		if (err) {
			scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
			return err;
		}
	}

	/* Initialize the peek results map */
	bpf_for(i, 0, MAX_SAMPLES) {
		u32 key = i;
		long pid = -1;

		bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
	}

	return 0;
}

void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
{
	int i;

	/* Destroy the primary DSQs */
	scx_bpf_destroy_dsq(test_dsq_id);
	scx_bpf_destroy_dsq(real_dsq_id);

	/* Destroy the DSQ pool */
	bpf_for(i, 0, DSQ_POOL_SIZE) {
		int dsq_id = dsq_pool_base_id + i;

		scx_bpf_destroy_dsq(dsq_id);
	}

	UEI_RECORD(uei, ei);
}

SEC(".struct_ops.link")
struct sched_ext_ops peek_dsq_ops = {
	.enqueue = (void *)peek_dsq_enqueue,
	.dispatch = (void *)peek_dsq_dispatch,
	.init = (void *)peek_dsq_init,
	.exit = (void *)peek_dsq_exit,
	.name = "peek_dsq",
};
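
/*
 * Editor's note: __COMPAT_scx_bpf_dsq_peek() used throughout this scheduler
 * comes from the scx compat headers. Presumably it falls back gracefully when
 * the running kernel does not provide the scx_bpf_dsq_peek() kfunc, which is
 * also what debug_ksym_exists reports to user space. A minimal sketch of such
 * a wrapper (an assumption, guarded so it never overrides the real one):
 */
#ifndef __COMPAT_scx_bpf_dsq_peek
#define __COMPAT_scx_bpf_dsq_peek(dsq_id)				\
	(bpf_ksym_exists(scx_bpf_dsq_peek) ?				\
	 scx_bpf_dsq_peek(dsq_id) : NULL)
#endif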

@@ -0,0 +1,224 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Test for DSQ operations including create, destroy, and peek operations.
 *
 * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
 */
#include <bpf/bpf.h>
#include <scx/common.h>
#include <sys/wait.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <sched.h>
#include "peek_dsq.bpf.skel.h"
#include "scx_test.h"

#define NUM_WORKERS 4

static bool workload_running = true;
static pthread_t workload_threads[NUM_WORKERS];

/**
 * Background workload thread that sleeps and wakes rapidly to exercise
 * the scheduler's enqueue operations and ensure DSQ operations get tested.
 */
static void *workload_thread_fn(void *arg)
{
	while (workload_running) {
		/* Sleep for a very short time to trigger scheduler activity */
		usleep(1000); /* 1ms sleep */
		/* Yield to ensure we go through the scheduler */
		sched_yield();
	}
	return NULL;
}

static enum scx_test_status setup(void **ctx)
{
	struct peek_dsq *skel;

	skel = peek_dsq__open();
	SCX_FAIL_IF(!skel, "Failed to open");
	SCX_ENUM_INIT(skel);
	SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");

	*ctx = skel;

	return SCX_TEST_PASS;
}

static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
{
	long count = 0;

	printf("Observed %s DSQ peek pids:\n", dsq_name);
	for (int i = 0; i < max_samples; i++) {
		long pid;
		int err;

		err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);
		if (err == 0) {
			if (pid == 0) {
				printf(" Sample %d: NULL peek\n", i);
			} else if (pid > 0) {
				printf(" Sample %d: pid %ld\n", i, pid);
				count++;
			}
		} else {
			printf(" Sample %d: error reading pid (err=%d)\n", i, err);
		}
	}
	printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name);
	return count;
}

static enum scx_test_status run(void *ctx)
{
	struct peek_dsq *skel = ctx;
	bool failed = false;
	int seconds = 3;
	int err;

	/* Enable the scheduler to test DSQ operations */
	printf("Enabling scheduler to test DSQ insert operations...\n");

	struct bpf_link *link =
		bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);

	if (!link) {
		SCX_ERR("Failed to attach struct_ops");
		return SCX_TEST_FAIL;
	}

	printf("Starting %d background workload threads...\n", NUM_WORKERS);
	workload_running = true;
	for (int i = 0; i < NUM_WORKERS; i++) {
		err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
		if (err) {
			SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
			/* Stop already created threads */
			workload_running = false;
			for (int j = 0; j < i; j++)
				pthread_join(workload_threads[j], NULL);
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}

	printf("Waiting for enqueue events.\n");
	sleep(seconds);
	while (skel->data->enqueue_count <= 0) {
		printf(".");
		fflush(stdout);
		sleep(1);
		seconds++;
		if (seconds >= 30) {
			printf("\n\u2717 Timeout waiting for enqueue events\n");
			/* Stop workload threads and cleanup */
			workload_running = false;
			for (int i = 0; i < NUM_WORKERS; i++)
				pthread_join(workload_threads[i], NULL);
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}

	workload_running = false;
	for (int i = 0; i < NUM_WORKERS; i++) {
		err = pthread_join(workload_threads[i], NULL);
		if (err) {
			SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
			bpf_link__destroy(link);
			return SCX_TEST_FAIL;
		}
	}
	printf("Background workload threads stopped.\n");

	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));

	/* Detach the scheduler */
	bpf_link__destroy(link);

	printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
	       skel->data->enqueue_count, skel->data->dispatch_count);
	printf("Debug: ksym_exists=%d\n",
	       skel->bss->debug_ksym_exists);

	/* Check DSQ insert result */
	printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
	if (skel->data->insert_test_cpu != -1) {
		printf("\u2713 DSQ insert succeeded!\n");
	} else {
		printf("\u2717 DSQ insert failed or not attempted\n");
		failed = true;
	}

	/* Check DSQ peek results */
	printf(" DSQ peek result 1 (before insert): %ld\n",
	       skel->data->dsq_peek_result1);
	if (skel->data->dsq_peek_result1 == 0) {
		printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
	} else {
		printf("\u2717 DSQ peek verification failed\n");
		failed = true;
	}

	printf(" DSQ peek result 2 (after insert): %ld\n",
	       skel->data->dsq_peek_result2);
	printf(" DSQ peek result 2, expected: %ld\n",
	       skel->data->dsq_peek_result2_expected);
	if (skel->data->dsq_peek_result2 ==
	    skel->data->dsq_peek_result2_expected) {
		printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
	} else {
		printf("\u2717 DSQ peek verification failed\n");
		failed = true;
	}

	printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
	printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);

	int pid_count;

	pid_count = print_observed_pids(skel->maps.peek_results,
					skel->data->max_samples, "DSQ pool");
	printf("Total non-null peek observations: %ld out of %ld\n",
	       skel->data->successful_peeks, skel->data->total_peek_attempts);

	if (skel->bss->debug_ksym_exists && pid_count == 0) {
		printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
		failed = true;
	}
	if (skel->bss->debug_ksym_exists && pid_count > 0)
		printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");

	if (failed)
		return SCX_TEST_FAIL;
	else
		return SCX_TEST_PASS;
}

static void cleanup(void *ctx)
{
	struct peek_dsq *skel = ctx;

	if (workload_running) {
		workload_running = false;
		for (int i = 0; i < NUM_WORKERS; i++)
			pthread_join(workload_threads[i], NULL);
	}

	peek_dsq__destroy(skel);
}

struct scx_test peek_dsq = {
	.name = "peek_dsq",
	.description =
		"Test DSQ create/destroy operations and peek functionality",
	.setup = setup,
	.run = run,
	.cleanup = cleanup,
};
REGISTER_SCX_TEST(&peek_dsq)