mirror of https://github.com/torvalds/linux.git
sched/mmcid: Switch over to the new mechanism
Now that all pieces are in place, change the implementations of sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict ownership scheme and switch context_switch() over to use the new mm_cid_schedin() functionality. The common case is that there is no mode change required, which makes fork() and exit() just update the user count and the constraints. In case that a new user would exceed the CID space limit the fork() context handles the transition to per CPU mode with mm::mm_cid::mutex held. exit() handles the transition back to per task mode when the user count drops below the switch back threshold. fork() might also be forced to handle a deferred switch back to per task mode, when a affinity change increased the number of allowed CPUs enough. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de

commit 653fda7ae7 (parent 9da6ccbcea)
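
Before reading the diff, the following stand-alone sketch models the fork()/exit() mode decision the changelog describes: the common case only adjusts the user count, while crossing the CID space limit switches to per-CPU mode and dropping back below a threshold switches back to per-task mode. This is a simplified user-space illustration, not the kernel code; the struct name mm_cid_model, the helpers model_fork()/model_exit(), the use of nr_cpus_allowed as the CID space limit, and the half-space switch-back threshold are assumptions for illustration only (the kernel's exact constraints live in mm_update_max_cids() and friends in the diff below).

/*
 * Simplified model of the fork()/exit() mode decision -- illustrative
 * assumptions only, not the kernel implementation.
 */
#include <stdbool.h>
#include <stdio.h>

struct mm_cid_model {
        unsigned int users;             /* tasks sharing the MM */
        unsigned int nr_cpus_allowed;   /* assumed upper bound of the CID space */
        bool percpu;                    /* current CID ownership mode */
};

/* fork(): add a user; switch to per-CPU mode once the CID space would be exceeded */
static void model_fork(struct mm_cid_model *mm)
{
        mm->users++;
        if (!mm->percpu && mm->users > mm->nr_cpus_allowed)
                mm->percpu = true;      /* rare case: mode change, done under the MM CID mutex */
}

/* exit(): drop a user; switch back to per-task mode below an assumed hysteresis threshold */
static void model_exit(struct mm_cid_model *mm)
{
        mm->users--;
        if (mm->percpu && mm->users < mm->nr_cpus_allowed / 2)
                mm->percpu = false;     /* rare case: hand CPU-owned CIDs back to the tasks */
}

int main(void)
{
        struct mm_cid_model mm = { .users = 1, .nr_cpus_allowed = 4, .percpu = false };

        for (int i = 0; i < 8; i++)
                model_fork(&mm);
        printf("after forks: users=%u percpu=%d\n", mm.users, mm.percpu);

        for (int i = 0; i < 8; i++)
                model_exit(&mm);
        printf("after exits: users=%u percpu=%d\n", mm.users, mm.percpu);
        return 0;
}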

@@ -84,24 +84,6 @@ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
        t->rseq.event.ids_changed = true;
}

/*
 * Invoked from switch_mm_cid() in context switch when the task gets a MM
 * CID assigned.
 *
 * This does not raise TIF_NOTIFY_RESUME as that happens in
 * rseq_sched_switch_event().
 */
static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid)
{
        /*
         * Requires a comparison as the switch_mm_cid() code does not
         * provide a conditional for it readily. So avoid excessive updates
         * when nothing changes.
         */
        if (t->rseq.ids.mm_cid != cid)
                t->rseq.event.ids_changed = true;
}

/* Enforce a full update after RSEQ registration and when execve() failed */
static inline void rseq_force_update(void)
{

@@ -169,7 +151,6 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { }
static inline void rseq_force_update(void) { }
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }

@@ -101,18 +101,18 @@ struct rseq_data { };
/**
 * struct sched_mm_cid - Storage for per task MM CID data
 * @active: MM CID is active for the task
 * @cid: The CID associated to the task
 * @last_cid: The last CID associated to the task
 * @cid: The CID associated to the task either permanently or
 *       borrowed from the CPU
 */
struct sched_mm_cid {
        unsigned int active;
        unsigned int cid;
        unsigned int last_cid;
};

/**
 * struct mm_cid_pcpu - Storage for per CPU MM_CID data
 * @cid: The CID associated to the CPU
 * @cid: The CID associated to the CPU either permanently or
 *       while a task with a CID is running
 */
struct mm_cid_pcpu {
        unsigned int cid;

@@ -956,7 +956,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)

#ifdef CONFIG_SCHED_MM_CID
        tsk->mm_cid.cid = MM_CID_UNSET;
        tsk->mm_cid.last_cid = MM_CID_UNSET;
        tsk->mm_cid.active = 0;
#endif
        return tsk;

@@ -5307,7 +5307,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
                }
        }

        switch_mm_cid(prev, next);
        mm_cid_switch_to(prev, next);

        /*
         * Tell rseq that the task was scheduled in. Must be after

@@ -10624,7 +10624,7 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
        return true;
}

static void __maybe_unused mm_cid_fixup_tasks_to_cpus(void)
static void mm_cid_fixup_tasks_to_cpus(void)
{
        struct mm_struct *mm = current->mm;
        struct task_struct *p, *t;

@@ -10674,25 +10674,81 @@ static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
void sched_mm_cid_fork(struct task_struct *t)
{
        struct mm_struct *mm = t->mm;
        bool percpu;

        WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);

        guard(mutex)(&mm->mm_cid.mutex);
        scoped_guard(raw_spinlock, &mm->mm_cid.lock) {
        scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
                struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);

                /* First user ? */
                if (!mm->mm_cid.users) {
                        sched_mm_cid_add_user(t, mm);
                        /* Preset last_cid for mm_cid_select() */
                        t->mm_cid.last_cid = mm->mm_cid.max_cids - 1;
                        t->mm_cid.cid = mm_get_cid(mm);
                        /* Required for execve() */
                        pcp->cid = t->mm_cid.cid;
                        return;
                }

                if (!sched_mm_cid_add_user(t, mm)) {
                        if (!mm->mm_cid.percpu)
                                t->mm_cid.cid = mm_get_cid(mm);
                        return;
                }

                /* Handle the mode change and transfer current's CID */
                percpu = !!mm->mm_cid.percpu;
                if (!percpu)
                        mm_cid_transit_to_task(current, pcp);
                else
                        mm_cid_transfer_to_cpu(current, pcp);
        }

        if (percpu) {
                mm_cid_fixup_tasks_to_cpus();
        } else {
                mm_cid_fixup_cpus_to_tasks(mm);
                t->mm_cid.cid = mm_get_cid(mm);
        }
}

static bool sched_mm_cid_remove_user(struct task_struct *t)
{
        t->mm_cid.active = 0;
        scoped_guard(preempt) {
                /* Clear the transition bit */
                t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
                mm_unset_cid_on_task(t);
        }
        t->mm->mm_cid.users--;
        return mm_update_max_cids(t->mm);
}

static bool __sched_mm_cid_exit(struct task_struct *t)
{
        struct mm_struct *mm = t->mm;

        if (!sched_mm_cid_remove_user(t))
                return false;
        /*
         * Contrary to fork() this only deals with a switch back to per
         * task mode either because the above decreased users or an
         * affinity change increased the number of allowed CPUs and the
         * deferred fixup did not run yet.
         */
        if (WARN_ON_ONCE(mm->mm_cid.percpu))
                return false;
        /*
         * A failed fork(2) cleanup never gets here, so @current must have
         * the same MM as @t. That's true for exit() and the failed
         * pthread_create() cleanup case.
         */
        if (WARN_ON_ONCE(current->mm != mm))
                return false;
        return true;
}

/*
 * When a task exits, the MM CID held by the task is no longer required as
 * the task cannot return to user space.

@@ -10703,10 +10759,43 @@ void sched_mm_cid_exit(struct task_struct *t)

        if (!mm || !t->mm_cid.active)
                return;

        guard(mutex)(&mm->mm_cid.mutex);
        scoped_guard(raw_spinlock, &mm->mm_cid.lock)
        /*
         * Ensure that only one instance is doing MM CID operations within
         * a MM. The common case is uncontended. The rare fixup case adds
         * some overhead.
         */
        scoped_guard(mutex, &mm->mm_cid.mutex) {
                /* mm_cid::mutex is sufficient to protect mm_cid::users */
                if (likely(mm->mm_cid.users > 1)) {
                        scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
                                if (!__sched_mm_cid_exit(t))
                                        return;
                                /* Mode change required. Transfer current's CID */
                                mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
                        }
                        mm_cid_fixup_cpus_to_tasks(mm);
                        return;
                }
                /* Last user */
                scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
                        /* Required across execve() */
                        if (t == current)
                                mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
                        /* Ignore mode change. There is nothing to do. */
                        sched_mm_cid_remove_user(t);
                }
        }

        /*
         * As this is the last user (execve(), process exit or failed
         * fork(2)) there is no concurrency anymore.
         *
         * Synchronize any pending work to ensure that there are no
         * dangling references left. @t->mm_cid.users is zero so nothing
         * can queue this work anymore.
         */
        irq_work_sync(&mm->mm_cid.irq_work);
        cancel_work_sync(&mm->mm_cid.work);
}

/* Deactivate MM CID allocation across execve() */

@@ -10719,18 +10808,12 @@ void sched_mm_cid_before_execve(struct task_struct *t)
void sched_mm_cid_after_execve(struct task_struct *t)
{
        sched_mm_cid_fork(t);
        guard(preempt)();
        mm_cid_select(t);
}

static void mm_cid_work_fn(struct work_struct *work)
{
        struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);

        /* Make it compile, but not functional yet */
        if (!IS_ENABLED(CONFIG_NEW_MM_CID))
                return;

        guard(mutex)(&mm->mm_cid.mutex);
        /* Did the last user task exit already? */
        if (!mm->mm_cid.users)

@@ -3745,83 +3745,7 @@ static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct
        mm_cid_schedin(next);
}

/* Active implementation */
static inline void init_sched_mm_cid(struct task_struct *t)
{
        struct mm_struct *mm = t->mm;
        unsigned int max_cid;

        if (!mm)
                return;

        /* Preset last_mm_cid */
        max_cid = min_t(int, READ_ONCE(mm->mm_cid.nr_cpus_allowed), atomic_read(&mm->mm_users));
        t->mm_cid.last_cid = max_cid - 1;
}

static inline bool __mm_cid_get(struct task_struct *t, unsigned int cid, unsigned int max_cids)
{
        struct mm_struct *mm = t->mm;

        if (cid >= max_cids)
                return false;
        if (test_and_set_bit(cid, mm_cidmask(mm)))
                return false;
        t->mm_cid.cid = t->mm_cid.last_cid = cid;
        __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
        return true;
}

static inline bool mm_cid_get(struct task_struct *t)
{
        struct mm_struct *mm = t->mm;
        unsigned int max_cids;

        max_cids = READ_ONCE(mm->mm_cid.max_cids);

        /* Try to reuse the last CID of this task */
        if (__mm_cid_get(t, t->mm_cid.last_cid, max_cids))
                return true;

        /* Try to reuse the last CID of this mm on this CPU */
        if (__mm_cid_get(t, __this_cpu_read(mm->mm_cid.pcpu->cid), max_cids))
                return true;

        /* Try the first zero bit in the cidmask. */
        return __mm_cid_get(t, find_first_zero_bit(mm_cidmask(mm), num_possible_cpus()), max_cids);
}

static inline void mm_cid_select(struct task_struct *t)
{
        /*
         * mm_cid_get() can fail when the maximum CID, which is determined
         * by min(mm->nr_cpus_allowed, mm->mm_users) changes concurrently.
         * That's a transient failure as there cannot be more tasks
         * concurrently on a CPU (or about to be scheduled in) than that.
         */
        for (;;) {
                if (mm_cid_get(t))
                        break;
        }
}

static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
{
        if (prev->mm_cid.active) {
                if (prev->mm_cid.cid != MM_CID_UNSET)
                        clear_bit(prev->mm_cid.cid, mm_cidmask(prev->mm));
                prev->mm_cid.cid = MM_CID_UNSET;
        }

        if (next->mm_cid.active) {
                mm_cid_select(next);
                rseq_sched_set_task_mm_cid(next, next->mm_cid.cid);
        }
}

#else /* !CONFIG_SCHED_MM_CID: */
static inline void mm_cid_select(struct task_struct *t) { }
static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */