Merge tag 'timers-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer core updates from Thomas Gleixner:

 - Prevent a thundering herd problem when the timekeeper CPU is delayed
   and a large number of CPUs compete to acquire jiffies_lock to do the
   update. Limit it to one CPU with a separate "uncontended" atomic
   variable (see the sketch after this list).

 - A set of improvements for the timer migration mechanism:

     - Support imbalanced NUMA trees correctly

     - Support dynamic exclusion of CPUs from the migrator duty to allow
       the cpuset/isolation mechanism to exclude them from handling
       timers of remote idle CPUs

 - The usual small updates, cleanups and enhancements
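
The thundering-herd fix in the first bullet gates the jiffies update behind a simple "only one non-timekeeper CPU at a time" atomic flag; the real change is tick_limited_update_jiffies64() in the tick-sched hunk further down. The sketch below is a minimal, self-contained userspace rendering of the same idea under stated assumptions: C11 atomics stand in for the kernel's atomic_t API, and do_jiffies_update() is a hypothetical placeholder for tick_do_update_jiffies64().

/*
 * Minimal userspace sketch (not the kernel code) of the "one updater at a
 * time" pattern used by tick_limited_update_jiffies64() in the tick-sched
 * hunk below.  C11 atomics stand in for the kernel's atomic_t API, and
 * do_jiffies_update() is a hypothetical placeholder for the real
 * tick_do_update_jiffies64().
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int in_progress;

static void do_jiffies_update(void)
{
	/* placeholder for the expensive, jiffies_lock-taking update */
}

static bool limited_update(void)
{
	int expected = 0;

	/*
	 * Cheap read first: if another CPU already owns the update, back
	 * off without issuing a failing cmpxchg on a contended cache line.
	 */
	if (atomic_load(&in_progress))
		return false;

	if (!atomic_compare_exchange_strong(&in_progress, &expected, 1))
		return false;

	/* The kernel additionally rechecks that jiffies is still stale here. */
	do_jiffies_update();
	atomic_store(&in_progress, 0);
	return true;
}

int main(void)
{
	printf("updated: %d\n", limited_update());
	return 0;
}

Note that the diff also relaxes the stall check in tick_sched_do_timer() from '==' to '>=', so a CPU that loses this race keeps its stalled_jiffies counter running and simply retries the update on a later tick.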

* tag 'timers-core-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers/migration: Exclude isolated cpus from hierarchy
  cpumask: Add initialiser to use cleanup helpers
  sched/isolation: Force housekeeping if isolcpus and nohz_full don't leave any
  cgroup/cpuset: Rename update_unbound_workqueue_cpumask() to update_isolation_cpumasks()
  timers/migration: Use scoped_guard on available flag set/clear
  timers/migration: Add mask for CPUs available in the hierarchy
  timers/migration: Rename 'online' bit to 'available'
  selftests/timers/nanosleep: Add tests for return of remaining time
  selftests/timers: Clean up kernel version check in posix_timers
  time: Fix a few typos in time[r] related code comments
  time: tick-oneshot: Add missing Return and parameter descriptions to kernel-doc
  hrtimer: Store time as ktime_t in restart block
  timers/migration: Remove dead code handling idle CPU checking for remote timers
  timers/migration: Remove unused "cpu" parameter from tmigr_get_group()
  timers/migration: Assert that hotplug preparing CPU is part of stable active hierarchy
  timers/migration: Fix imbalanced NUMA trees
  timers/migration: Remove locking on group connection
  timers/migration: Convert "while" loops to use "for"
  tick/sched: Limit non-timekeeper CPUs calling jiffies update
Commit d42e504a55 by Linus Torvalds, 2025-12-02 09:58:33 -08:00
16 changed files with 510 additions and 203 deletions

View File

@ -1022,6 +1022,7 @@ static __always_inline unsigned int cpumask_size(void)
#define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x)
#define __cpumask_var_read_mostly __read_mostly #define __cpumask_var_read_mostly __read_mostly
#define CPUMASK_VAR_NULL NULL
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);
@ -1068,6 +1069,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask)
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
#define __cpumask_var_read_mostly #define __cpumask_var_read_mostly
#define CPUMASK_VAR_NULL {}
static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{ {

View File

@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max,
* @min: Minimum time in microseconds to sleep * @min: Minimum time in microseconds to sleep
* @max: Maximum time in microseconds to sleep * @max: Maximum time in microseconds to sleep
* *
* For basic information please refere to usleep_range_state(). * For basic information please refer to usleep_range_state().
* *
* The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep.
*/ */
@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max)
* @min: Minimum time in microseconds to sleep * @min: Minimum time in microseconds to sleep
* @max: Maximum time in microseconds to sleep * @max: Maximum time in microseconds to sleep
* *
* For basic information please refere to usleep_range_state(). * For basic information please refer to usleep_range_state().
* *
* The sleeping task has the state TASK_IDLE during the sleep to prevent * The sleeping task has the state TASK_IDLE during the sleep to prevent
* contribution to the load avarage. * contribution to the load average.
*/ */
static inline void usleep_range_idle(unsigned long min, unsigned long max) static inline void usleep_range_idle(unsigned long min, unsigned long max)
{ {
@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max)
* ssleep - wrapper for seconds around msleep * ssleep - wrapper for seconds around msleep
* @seconds: Requested sleep duration in seconds * @seconds: Requested sleep duration in seconds
* *
* Please refere to msleep() for detailed information. * Please refer to msleep() for detailed information.
*/ */
static inline void ssleep(unsigned int seconds) static inline void ssleep(unsigned int seconds)
{ {

View File

@ -43,7 +43,7 @@ struct restart_block {
struct __kernel_timespec __user *rmtp; struct __kernel_timespec __user *rmtp;
struct old_timespec32 __user *compat_rmtp; struct old_timespec32 __user *compat_rmtp;
}; };
u64 expires; ktime_t expires;
} nanosleep; } nanosleep;
/* For poll */ /* For poll */
struct { struct {

View File

@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu);
#define timers_dead_cpu NULL #define timers_dead_cpu NULL
#endif #endif
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask);
#else
static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
{
return 0;
}
#endif
#endif #endif

View File

@ -173,14 +173,14 @@ DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active,
TP_ARGS(tmc) TP_ARGS(tmc)
); );
DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online, DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_available,
TP_PROTO(struct tmigr_cpu *tmc), TP_PROTO(struct tmigr_cpu *tmc),
TP_ARGS(tmc) TP_ARGS(tmc)
); );
DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline, DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_unavailable,
TP_PROTO(struct tmigr_cpu *tmc), TP_PROTO(struct tmigr_cpu *tmc),

View File

@ -1391,7 +1391,7 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
return isolcpus_updated; return isolcpus_updated;
} }
static void update_unbound_workqueue_cpumask(bool isolcpus_updated) static void update_isolation_cpumasks(bool isolcpus_updated)
{ {
int ret; int ret;
@ -1402,6 +1402,9 @@ static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
ret = workqueue_unbound_exclude_cpumask(isolated_cpus); ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
WARN_ON_ONCE(ret < 0); WARN_ON_ONCE(ret < 0);
ret = tmigr_isolated_exclude_cpumask(isolated_cpus);
WARN_ON_ONCE(ret < 0);
} }
/** /**
@ -1555,7 +1558,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
list_add(&cs->remote_sibling, &remote_children); list_add(&cs->remote_sibling, &remote_children);
cpumask_copy(cs->effective_xcpus, tmp->new_cpus); cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock); spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated); update_isolation_cpumasks(isolcpus_updated);
cpuset_force_rebuild(); cpuset_force_rebuild();
cs->prs_err = 0; cs->prs_err = 0;
@ -1596,7 +1599,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
compute_excpus(cs, cs->effective_xcpus); compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs); reset_partition_data(cs);
spin_unlock_irq(&callback_lock); spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated); update_isolation_cpumasks(isolcpus_updated);
cpuset_force_rebuild(); cpuset_force_rebuild();
/* /*
@ -1665,7 +1668,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
if (xcpus) if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus); cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock); spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated); update_isolation_cpumasks(isolcpus_updated);
if (adding || deleting) if (adding || deleting)
cpuset_force_rebuild(); cpuset_force_rebuild();
@ -2023,7 +2026,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
WARN_ON_ONCE(parent->nr_subparts < 0); WARN_ON_ONCE(parent->nr_subparts < 0);
} }
spin_unlock_irq(&callback_lock); spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated); update_isolation_cpumasks(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update)) if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs); update_partition_exclusive_flag(cs, new_prs);
@ -3043,7 +3046,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
else if (isolcpus_updated) else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock); spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated); update_isolation_cpumasks(isolcpus_updated);
/* Force update if switching back to member & update effective_xcpus */ /* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs); update_cpumasks_hier(cs, &tmpmask, !new_prs);

View File

@ -167,6 +167,29 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
} }
} }
/*
* Check the combination of nohz_full and isolcpus=domain,
* necessary to avoid problems with the timer migration
* hierarchy. managed_irq is ignored by this check since it
* isn't considered in the timer migration logic.
*/
iter_flags = housekeeping.flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
type = find_first_bit(&iter_flags, HK_TYPE_MAX);
/*
* Pass the check if none of these flags were previously set or
* are not in the current selection.
*/
iter_flags = flags & (HK_FLAG_KERNEL_NOISE | HK_FLAG_DOMAIN);
first_cpu = (type == HK_TYPE_MAX || !iter_flags) ? 0 :
cpumask_first_and_and(cpu_present_mask,
housekeeping_staging, housekeeping.cpumasks[type]);
if (first_cpu >= min(nr_cpu_ids, setup_max_cpus)) {
pr_warn("Housekeeping: must include one present CPU "
"neither in nohz_full= nor in isolcpus=domain, "
"ignoring setting %s\n", str);
goto free_housekeeping_staging;
}
iter_flags = flags & ~housekeeping.flags; iter_flags = flags & ~housekeeping.flags;
for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)

View File

@ -2145,7 +2145,7 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
int ret; int ret;
hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
ret = do_nanosleep(&t, HRTIMER_MODE_ABS); ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer); destroy_hrtimer_on_stack(&t.timer);
return ret; return ret;
@ -2172,7 +2172,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
restart = &current->restart_block; restart = &current->restart_block;
restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
set_restart_fn(restart, hrtimer_nanosleep_restart); set_restart_fn(restart, hrtimer_nanosleep_restart);
out: out:
destroy_hrtimer_on_stack(&t.timer); destroy_hrtimer_on_stack(&t.timer);

View File

@ -1557,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* Report back to the user the time still remaining. * Report back to the user the time still remaining.
*/ */
restart = &current->restart_block; restart = &current->restart_block;
restart->nanosleep.expires = expires; restart->nanosleep.expires = ns_to_ktime(expires);
if (restart->nanosleep.type != TT_NONE) if (restart->nanosleep.type != TT_NONE)
error = nanosleep_copyout(restart, &it.it_value); error = nanosleep_copyout(restart, &it.it_value);
} }
@ -1599,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
clockid_t which_clock = restart_block->nanosleep.clockid; clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec64 t; struct timespec64 t;
t = ns_to_timespec64(restart_block->nanosleep.expires); t = ktime_to_timespec64(restart_block->nanosleep.expires);
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
} }

View File

@ -1242,7 +1242,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
* sys_clock_settime(). The kernel internal timekeeping is always using * sys_clock_settime(). The kernel internal timekeeping is always using
* nanoseconds precision independent of the clocksource device which is * nanoseconds precision independent of the clocksource device which is
* used to read the time from. The resolution of that device only * used to read the time from. The resolution of that device only
* affects the presicion of the time returned by sys_clock_gettime(). * affects the precision of the time returned by sys_clock_gettime().
* *
* Returns: * Returns:
* 0 Success. @tp contains the resolution * 0 Success. @tp contains the resolution

View File

@ -19,6 +19,10 @@
/** /**
* tick_program_event - program the CPU local timer device for the next event * tick_program_event - program the CPU local timer device for the next event
* @expires: the time at which the next timer event should occur
* @force: flag to force reprograming even if the event time hasn't changed
*
* Return: 0 on success, negative error code on failure
*/ */
int tick_program_event(ktime_t expires, int force) int tick_program_event(ktime_t expires, int force)
{ {
@ -57,6 +61,13 @@ void tick_resume_oneshot(void)
/** /**
* tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
* @newdev: Pointer to the clock event device to configure
* @handler: Function to be called when the event device triggers an interrupt
* @next_event: Initial expiry time for the next event (in ktime)
*
* Configures the specified clock event device for onshot mode,
* assigns the given handler as its event callback, and programs
* the device to trigger at the specified next event time.
*/ */
void tick_setup_oneshot(struct clock_event_device *newdev, void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *), void (*handler)(struct clock_event_device *),
@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
/** /**
* tick_switch_to_oneshot - switch to oneshot mode * tick_switch_to_oneshot - switch to oneshot mode
* @handler: function to call when an event occurs on the tick device
*
* Return: 0 on success, -EINVAL if the tick device is not present,
* not functional, or does not support oneshot mode.
*/ */
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{ {
@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
/** /**
* tick_oneshot_mode_active - check whether the system is in oneshot mode * tick_oneshot_mode_active - check whether the system is in oneshot mode
* *
* returns 1 when either nohz or highres are enabled. otherwise 0. * Return: 1 when either nohz or highres are enabled, otherwise 0.
*/ */
int tick_oneshot_mode_active(void) int tick_oneshot_mode_active(void)
{ {
@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void)
* tick_init_highres - switch to high resolution mode * tick_init_highres - switch to high resolution mode
* *
* Called with interrupts disabled. * Called with interrupts disabled.
*
* Return: 0 on success, -EINVAL if the tick device cannot switch
* to oneshot/high-resolution mode.
*/ */
int tick_init_highres(void) int tick_init_highres(void)
{ {

View File

@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts,
ts->flags &= ~flag; ts->flags &= ~flag;
} }
/*
* Allow only one non-timekeeper CPU at a time update jiffies from
* the timer tick.
*
* Returns true if update was run.
*/
static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now)
{
static atomic_t in_progress;
int inp;
inp = atomic_read(&in_progress);
if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
return false;
if (ts->last_tick_jiffies == jiffies)
tick_do_update_jiffies64(now);
atomic_set(&in_progress, 0);
return true;
}
#define MAX_STALLED_JIFFIES 5 #define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
ts->stalled_jiffies = 0; ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies); ts->last_tick_jiffies = READ_ONCE(jiffies);
} else { } else {
if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
tick_do_update_jiffies64(now); if (tick_limited_update_jiffies64(ts, now)) {
ts->stalled_jiffies = 0; ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies); ts->last_tick_jiffies = READ_ONCE(jiffies);
}
} }
} }

View File

@ -10,6 +10,7 @@
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/timerqueue.h> #include <linux/timerqueue.h>
#include <trace/events/ipi.h> #include <trace/events/ipi.h>
#include <linux/sched/isolation.h>
#include "timer_migration.h" #include "timer_migration.h"
#include "tick-internal.h" #include "tick-internal.h"
@ -420,14 +421,54 @@ static struct list_head *tmigr_level_list __read_mostly;
static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly;
static struct tmigr_group *tmigr_root;
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
/*
* CPUs available for timer migration.
* Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
* Additionally tmigr_available_mutex serializes set/clear operations with each other.
*/
static cpumask_var_t tmigr_available_cpumask;
static DEFINE_MUTEX(tmigr_available_mutex);
/* Enabled during late initcall */
static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated);
#define TMIGR_NONE 0xFF #define TMIGR_NONE 0xFF
#define BIT_CNT 8 #define BIT_CNT 8
static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
{ {
return !(tmc->tmgroup && tmc->online); return !(tmc->tmgroup && tmc->available);
}
/*
* Returns true if @cpu should be excluded from the hierarchy as isolated.
* Domain isolated CPUs don't participate in timer migration, nohz_full CPUs
* are still part of the hierarchy but become idle (from a tick and timer
* migration perspective) when they stop their tick. This lets the timekeeping
* CPU handle their global timers. Marking also isolated CPUs as idle would be
* too costly, hence they are completely excluded from the hierarchy.
* This check is necessary, for instance, to prevent offline isolated CPUs from
* being incorrectly marked as available once getting back online.
*
* This function returns false during early boot and the isolation logic is
* enabled only after isolated CPUs are marked as unavailable at late boot.
* The tick CPU can be isolated at boot, however we cannot mark it as
* unavailable to avoid having no global migrator for the nohz_full CPUs. This
* should be ensured by the callers of this function: implicitly from hotplug
* callbacks and explicitly in tmigr_init_isolation() and
* tmigr_isolated_exclude_cpumask().
*/
static inline bool tmigr_is_isolated(int cpu)
{
if (!static_branch_unlikely(&tmigr_exclude_isolated))
return false;
return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) ||
cpuset_cpu_is_isolated(cpu)) &&
housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE);
} }
/* /*
@ -502,11 +543,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
* @now: timer base monotonic * @now: timer base monotonic
* @check: is set if there is the need to handle remote timers; * @check: is set if there is the need to handle remote timers;
* required in tmigr_requires_handle_remote() only * required in tmigr_requires_handle_remote() only
* @tmc_active: this flag indicates, whether the CPU which triggers
* the hierarchy walk is !idle in the timer migration
* hierarchy. When the CPU is idle and the whole hierarchy is
* idle, only the first event of the top level has to be
* considered.
*/ */
struct tmigr_walk { struct tmigr_walk {
u64 nextexp; u64 nextexp;
@ -517,16 +553,13 @@ struct tmigr_walk {
unsigned long basej; unsigned long basej;
u64 now; u64 now;
bool check; bool check;
bool tmc_active;
}; };
typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
static void __walk_groups(up_f up, struct tmigr_walk *data, static void __walk_groups_from(up_f up, struct tmigr_walk *data,
struct tmigr_cpu *tmc) struct tmigr_group *child, struct tmigr_group *group)
{ {
struct tmigr_group *child = NULL, *group = tmc->tmgroup;
do { do {
WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
@ -544,6 +577,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
} while (group); } while (group);
} }
static void __walk_groups(up_f up, struct tmigr_walk *data,
struct tmigr_cpu *tmc)
{
__walk_groups_from(up, data, NULL, tmc->tmgroup);
}
static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
{ {
lockdep_assert_held(&tmc->lock); lockdep_assert_held(&tmc->lock);
@ -708,7 +747,7 @@ void tmigr_cpu_activate(void)
/* /*
* Returns true, if there is nothing to be propagated to the next level * Returns true, if there is nothing to be propagated to the next level
* *
* @data->firstexp is set to expiry of first gobal event of the (top level of * @data->firstexp is set to expiry of first global event of the (top level of
* the) hierarchy, but only when hierarchy is completely idle. * the) hierarchy, but only when hierarchy is completely idle.
* *
* The child and group states need to be read under the lock, to prevent a race * The child and group states need to be read under the lock, to prevent a race
@ -926,7 +965,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* updated the event takes care when hierarchy is completely * updated the event takes care when hierarchy is completely
* idle. Otherwise the migrator does it as the event is enqueued. * idle. Otherwise the migrator does it as the event is enqueued.
*/ */
if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
now < tmc->cpuevt.nextevt.expires) { now < tmc->cpuevt.nextevt.expires) {
raw_spin_unlock_irq(&tmc->lock); raw_spin_unlock_irq(&tmc->lock);
return; return;
@ -973,7 +1012,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* (See also section "Required event and timerqueue update after a * (See also section "Required event and timerqueue update after a
* remote expiry" in the documentation at the top) * remote expiry" in the documentation at the top)
*/ */
if (!tmc->online || !tmc->idle) { if (!tmc->available || !tmc->idle) {
timer_unlock_remote_bases(cpu); timer_unlock_remote_bases(cpu);
goto unlock; goto unlock;
} }
@ -1113,15 +1152,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
*/ */
if (!tmigr_check_migrator(group, childmask)) if (!tmigr_check_migrator(group, childmask))
return true; return true;
/*
* When there is a parent group and the CPU which triggered the
* hierarchy walk is not active, proceed the walk to reach the top level
* group before reading the next_expiry value.
*/
if (group->parent && !data->tmc_active)
return false;
/* /*
* The lock is required on 32bit architectures to read the variable * The lock is required on 32bit architectures to read the variable
* consistently with a concurrent writer. On 64bit the lock is not * consistently with a concurrent writer. On 64bit the lock is not
@ -1166,7 +1196,6 @@ bool tmigr_requires_handle_remote(void)
data.now = get_jiffies_update(&jif); data.now = get_jiffies_update(&jif);
data.childmask = tmc->groupmask; data.childmask = tmc->groupmask;
data.firstexp = KTIME_MAX; data.firstexp = KTIME_MAX;
data.tmc_active = !tmc->idle;
data.check = false; data.check = false;
/* /*
@ -1432,38 +1461,43 @@ static long tmigr_trigger_active(void *unused)
{ {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
WARN_ON_ONCE(!tmc->online || tmc->idle); WARN_ON_ONCE(!tmc->available || tmc->idle);
return 0; return 0;
} }
static int tmigr_cpu_offline(unsigned int cpu) static int tmigr_clear_cpu_available(unsigned int cpu)
{ {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
int migrator; int migrator;
u64 firstexp; u64 firstexp;
raw_spin_lock_irq(&tmc->lock); guard(mutex)(&tmigr_available_mutex);
tmc->online = false;
WRITE_ONCE(tmc->wakeup, KTIME_MAX);
/* cpumask_clear_cpu(cpu, tmigr_available_cpumask);
* CPU has to handle the local events on his own, when on the way to scoped_guard(raw_spinlock_irq, &tmc->lock) {
* offline; Therefore nextevt value is set to KTIME_MAX if (!tmc->available)
*/ return 0;
firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); tmc->available = false;
trace_tmigr_cpu_offline(tmc); WRITE_ONCE(tmc->wakeup, KTIME_MAX);
raw_spin_unlock_irq(&tmc->lock);
/*
* CPU has to handle the local events on his own, when on the way to
* offline; Therefore nextevt value is set to KTIME_MAX
*/
firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
trace_tmigr_cpu_unavailable(tmc);
}
if (firstexp != KTIME_MAX) { if (firstexp != KTIME_MAX) {
migrator = cpumask_any_but(cpu_online_mask, cpu); migrator = cpumask_any(tmigr_available_cpumask);
work_on_cpu(migrator, tmigr_trigger_active, NULL); work_on_cpu(migrator, tmigr_trigger_active, NULL);
} }
return 0; return 0;
} }
static int tmigr_cpu_online(unsigned int cpu) static int tmigr_set_cpu_available(unsigned int cpu)
{ {
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@ -1471,16 +1505,123 @@ static int tmigr_cpu_online(unsigned int cpu)
if (WARN_ON_ONCE(!tmc->tmgroup)) if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL; return -EINVAL;
raw_spin_lock_irq(&tmc->lock); if (tmigr_is_isolated(cpu))
trace_tmigr_cpu_online(tmc); return 0;
tmc->idle = timer_base_is_idle();
if (!tmc->idle) guard(mutex)(&tmigr_available_mutex);
__tmigr_cpu_activate(tmc);
tmc->online = true; cpumask_set_cpu(cpu, tmigr_available_cpumask);
raw_spin_unlock_irq(&tmc->lock); scoped_guard(raw_spinlock_irq, &tmc->lock) {
if (tmc->available)
return 0;
trace_tmigr_cpu_available(tmc);
tmc->idle = timer_base_is_idle();
if (!tmc->idle)
__tmigr_cpu_activate(tmc);
tmc->available = true;
}
return 0; return 0;
} }
static void tmigr_cpu_isolate(struct work_struct *ignored)
{
tmigr_clear_cpu_available(smp_processor_id());
}
static void tmigr_cpu_unisolate(struct work_struct *ignored)
{
tmigr_set_cpu_available(smp_processor_id());
}
/**
* tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
* @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy
*
* This function can be called from cpuset code to provide the new set of
* isolated CPUs that should be excluded from the hierarchy.
* Online CPUs not present in exclude_cpumask but already excluded are brought
* back to the hierarchy.
* Functions to isolate/unisolate need to be called locally and can sleep.
*/
int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
{
struct work_struct __percpu *works __free(free_percpu) =
alloc_percpu(struct work_struct);
cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
int cpu;
lockdep_assert_cpus_held();
if (!works)
return -ENOMEM;
if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
/*
* First set previously isolated CPUs as available (unisolate).
* This cpumask contains only CPUs that switched to available now.
*/
cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
for_each_cpu(cpu, cpumask) {
struct work_struct *work = per_cpu_ptr(works, cpu);
INIT_WORK(work, tmigr_cpu_unisolate);
schedule_work_on(cpu, work);
}
for_each_cpu(cpu, cpumask)
flush_work(per_cpu_ptr(works, cpu));
/*
* Then clear previously available CPUs (isolate).
* This cpumask contains only CPUs that switched to not available now.
* There cannot be overlap with the newly available ones.
*/
cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask);
cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
/*
* Handle this here and not in the cpuset code because exclude_cpumask
* might include also the tick CPU if included in isolcpus.
*/
for_each_cpu(cpu, cpumask) {
if (!tick_nohz_cpu_hotpluggable(cpu)) {
cpumask_clear_cpu(cpu, cpumask);
break;
}
}
for_each_cpu(cpu, cpumask) {
struct work_struct *work = per_cpu_ptr(works, cpu);
INIT_WORK(work, tmigr_cpu_isolate);
schedule_work_on(cpu, work);
}
for_each_cpu(cpu, cpumask)
flush_work(per_cpu_ptr(works, cpu));
return 0;
}
static int __init tmigr_init_isolation(void)
{
cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
static_branch_enable(&tmigr_exclude_isolated);
if (!housekeeping_enabled(HK_TYPE_DOMAIN))
return 0;
if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
/* Protect against RCU torture hotplug testing */
guard(cpus_read_lock)();
return tmigr_isolated_exclude_cpumask(cpumask);
}
late_initcall(tmigr_init_isolation);
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
int node) int node)
{ {
@ -1498,21 +1639,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
s.seq = 0; s.seq = 0;
atomic_set(&group->migr_state, s.state); atomic_set(&group->migr_state, s.state);
/*
* If this is a new top-level, prepare its groupmask in advance.
* This avoids accidents where yet another new top-level is
* created in the future and made visible before the current groupmask.
*/
if (list_empty(&tmigr_level_list[lvl])) {
group->groupmask = BIT(0);
/*
* The previous top level has prepared its groupmask already,
* simply account it as the first child.
*/
if (lvl > 0)
group->num_children = 1;
}
timerqueue_init_head(&group->events); timerqueue_init_head(&group->events);
timerqueue_init(&group->groupevt.nextevt); timerqueue_init(&group->groupevt.nextevt);
group->groupevt.nextevt.expires = KTIME_MAX; group->groupevt.nextevt.expires = KTIME_MAX;
@ -1520,8 +1646,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true; group->groupevt.ignore = true;
} }
static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
unsigned int lvl)
{ {
struct tmigr_group *tmp, *group = NULL; struct tmigr_group *tmp, *group = NULL;
@ -1567,25 +1692,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
return group; return group;
} }
static bool tmigr_init_root(struct tmigr_group *group, bool activate)
{
if (!group->parent && group != tmigr_root) {
/*
* This is the new top-level, prepare its groupmask in advance
* to avoid accidents where yet another new top-level is
* created in the future and made visible before this groupmask.
*/
group->groupmask = BIT(0);
WARN_ON_ONCE(activate);
return true;
}
return false;
}
static void tmigr_connect_child_parent(struct tmigr_group *child, static void tmigr_connect_child_parent(struct tmigr_group *child,
struct tmigr_group *parent, struct tmigr_group *parent,
bool activate) bool activate)
{ {
struct tmigr_walk data; if (tmigr_init_root(parent, activate)) {
raw_spin_lock_irq(&child->lock);
raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
if (activate) {
/* /*
* @child is the old top and @parent the new one. In this * The previous top level had prepared its groupmask already,
* case groupmask is pre-initialized and @child already * simply account it in advance as the first child. If some groups
* accounted, along with its new sibling corresponding to the * have been created between the old and new root due to node
* CPU going up. * mismatch, the new root's child will be intialized accordingly.
*/ */
WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); parent->num_children = 1;
}
/* Connecting old root to new root ? */
if (!parent->parent && activate) {
/*
* @child is the old top, or in case of node mismatch, some
* intermediate group between the old top and the new one in
* @parent. In this case the @child must be pre-accounted above
* as the first child. Its new inactive sibling corresponding
* to the CPU going up has been accounted as the second child.
*/
WARN_ON_ONCE(parent->num_children != 2);
child->groupmask = BIT(0);
} else { } else {
/* Adding @child for the CPU going up to @parent. */ /* Common case adding @child for the CPU going up to @parent. */
child->groupmask = BIT(parent->num_children++); child->groupmask = BIT(parent->num_children++);
} }
@ -1596,87 +1747,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/ */
smp_store_release(&child->parent, parent); smp_store_release(&child->parent, parent);
raw_spin_unlock(&parent->lock);
raw_spin_unlock_irq(&child->lock);
trace_tmigr_connect_child_parent(child); trace_tmigr_connect_child_parent(child);
if (!activate)
return;
/*
* To prevent inconsistent states, active children need to be active in
* the new parent as well. Inactive children are already marked inactive
* in the parent group:
*
* * When new groups were created by tmigr_setup_groups() starting from
* the lowest level (and not higher then one level below the current
* top level), then they are not active. They will be set active when
* the new online CPU comes active.
*
* * But if a new group above the current top level is required, it is
* mandatory to propagate the active state of the already existing
* child to the new parent. So tmigr_connect_child_parent() is
* executed with the formerly top level group (child) and the newly
* created group (parent).
*
* * It is ensured that the child is active, as this setup path is
* executed in hotplug prepare callback. This is exectued by an
* already connected and !idle CPU. Even if all other CPUs go idle,
* the CPU executing the setup will be responsible up to current top
* level group. And the next time it goes inactive, it will release
* the new childmask and parent to subsequent walkers through this
* @child. Therefore propagate active state unconditionally.
*/
data.childmask = child->groupmask;
/*
* There is only one new level per time (which is protected by
* tmigr_mutex). When connecting the child and the parent and set the
* child active when the parent is inactive, the parent needs to be the
* uppermost level. Otherwise there went something wrong!
*/
WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
} }
static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
struct tmigr_group *start, bool activate)
{ {
struct tmigr_group *group, *child, **stack; struct tmigr_group *group, *child, **stack;
int top = 0, err = 0, i = 0; int i, top = 0, err = 0, start_lvl = 0;
struct list_head *lvllist; bool root_mismatch = false;
stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
if (!stack) if (!stack)
return -ENOMEM; return -ENOMEM;
do { if (start) {
group = tmigr_get_group(cpu, node, i); stack[start->level] = start;
start_lvl = start->level + 1;
}
if (tmigr_root)
root_mismatch = tmigr_root->numa_node != node;
for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
group = tmigr_get_group(node, i);
if (IS_ERR(group)) { if (IS_ERR(group)) {
err = PTR_ERR(group); err = PTR_ERR(group);
i--;
break; break;
} }
top = i; top = i;
stack[i++] = group; stack[i] = group;
/* /*
* When booting only less CPUs of a system than CPUs are * When booting only less CPUs of a system than CPUs are
* available, not all calculated hierarchy levels are required. * available, not all calculated hierarchy levels are required,
* unless a node mismatch is detected.
* *
* The loop is aborted as soon as the highest level, which might * The loop is aborted as soon as the highest level, which might
* be different from tmigr_hierarchy_levels, contains only a * be different from tmigr_hierarchy_levels, contains only a
* single group. * single group, unless the nodes mismatch below tmigr_crossnode_level
*/ */
if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) if (group->parent)
break; break;
if ((!root_mismatch || i >= tmigr_crossnode_level) &&
list_is_singular(&tmigr_level_list[i]))
break;
}
} while (i < tmigr_hierarchy_levels); /* Assert single root without parent */
if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
return -EINVAL;
/* Assert single root */ for (; i >= start_lvl; i--) {
WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); group = stack[i];
while (i > 0) {
group = stack[--i];
if (err < 0) { if (err < 0) {
list_del(&group->list); list_del(&group->list);
@ -1692,12 +1817,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
if (i == 0) { if (i == 0) {
struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
raw_spin_lock_irq(&group->lock);
tmc->tmgroup = group; tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++); tmc->groupmask = BIT(group->num_children++);
raw_spin_unlock_irq(&group->lock); tmigr_init_root(group, activate);
trace_tmigr_connect_cpu_parent(tmc); trace_tmigr_connect_cpu_parent(tmc);
@ -1705,42 +1828,58 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
continue; continue;
} else { } else {
child = stack[i - 1]; child = stack[i - 1];
/* Will be activated at online time */ tmigr_connect_child_parent(child, group, activate);
tmigr_connect_child_parent(child, group, false);
}
/* check if uppermost level was newly created */
if (top != i)
continue;
WARN_ON_ONCE(top == 0);
lvllist = &tmigr_level_list[top];
/*
* Newly created root level should have accounted the upcoming
* CPU's child group and pre-accounted the old root.
*/
if (group->num_children == 2 && list_is_singular(lvllist)) {
/*
* The target CPU must never do the prepare work, except
* on early boot when the boot CPU is the target. Otherwise
* it may spuriously activate the old top level group inside
* the new one (nevertheless whether old top level group is
* active or not) and/or release an uninitialized childmask.
*/
WARN_ON_ONCE(cpu == raw_smp_processor_id());
lvllist = &tmigr_level_list[top - 1];
list_for_each_entry(child, lvllist, list) {
if (child->parent)
continue;
tmigr_connect_child_parent(child, group, true);
}
} }
} }
if (err < 0)
goto out;
if (activate) {
struct tmigr_walk data;
union tmigr_state state;
/*
* To prevent inconsistent states, active children need to be active in
* the new parent as well. Inactive children are already marked inactive
* in the parent group:
*
* * When new groups were created by tmigr_setup_groups() starting from
* the lowest level, then they are not active. They will be set active
* when the new online CPU comes active.
*
* * But if new groups above the current top level are required, it is
* mandatory to propagate the active state of the already existing
* child to the new parents. So tmigr_active_up() activates the
* new parents while walking up from the old root to the new.
*
* * It is ensured that @start is active, as this setup path is
* executed in hotplug prepare callback. This is executed by an
* already connected and !idle CPU. Even if all other CPUs go idle,
* the CPU executing the setup will be responsible up to current top
* level group. And the next time it goes inactive, it will release
* the new childmask and parent to subsequent walkers through this
* @child. Therefore propagate active state unconditionally.
*/
state.state = atomic_read(&start->migr_state);
WARN_ON_ONCE(!state.active);
WARN_ON_ONCE(!start->parent);
data.childmask = start->groupmask;
__walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
/* Root update */
if (list_is_singular(&tmigr_level_list[top])) {
group = list_first_entry(&tmigr_level_list[top],
typeof(*group), list);
WARN_ON_ONCE(group->parent);
if (tmigr_root) {
/* Old root should be the same or below */
WARN_ON_ONCE(tmigr_root->level > top);
}
tmigr_root = group;
}
out:
kfree(stack); kfree(stack);
return err; return err;
@ -1748,12 +1887,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
static int tmigr_add_cpu(unsigned int cpu) static int tmigr_add_cpu(unsigned int cpu)
{ {
struct tmigr_group *old_root = tmigr_root;
int node = cpu_to_node(cpu); int node = cpu_to_node(cpu);
int ret; int ret;
mutex_lock(&tmigr_mutex); guard(mutex)(&tmigr_mutex);
ret = tmigr_setup_groups(cpu, node);
mutex_unlock(&tmigr_mutex); ret = tmigr_setup_groups(cpu, node, NULL, false);
/* Root has changed? Connect the old one to the new */
if (ret >= 0 && old_root && old_root != tmigr_root) {
/*
* The target CPU must never do the prepare work, except
* on early boot when the boot CPU is the target. Otherwise
* it may spuriously activate the old top level group inside
* the new one (nevertheless whether old top level group is
* active or not) and/or release an uninitialized childmask.
*/
WARN_ON_ONCE(cpu == raw_smp_processor_id());
/*
* The (likely) current CPU is expected to be online in the hierarchy,
* otherwise the old root may not be active as expected.
*/
WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
}
return ret; return ret;
} }
@ -1798,6 +1956,11 @@ static int __init tmigr_init(void)
if (ncpus == 1) if (ncpus == 1)
return 0; return 0;
if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) {
ret = -ENOMEM;
goto err;
}
/* /*
* Calculate the required hierarchy levels. Unfortunately there is no * Calculate the required hierarchy levels. Unfortunately there is no
* reliable information available, unless all possible CPUs have been * reliable information available, unless all possible CPUs have been
@ -1847,7 +2010,7 @@ static int __init tmigr_init(void)
goto err; goto err;
ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
tmigr_cpu_online, tmigr_cpu_offline); tmigr_set_cpu_available, tmigr_clear_cpu_available);
if (ret) if (ret)
goto err; goto err;

View File

@ -97,7 +97,7 @@ struct tmigr_group {
*/ */
struct tmigr_cpu { struct tmigr_cpu {
raw_spinlock_t lock; raw_spinlock_t lock;
bool online; bool available;
bool idle; bool idle;
bool remote; bool remote;
struct tmigr_group *tmgroup; struct tmigr_group *tmgroup;

View File

@ -116,6 +116,56 @@ int nanosleep_test(int clockid, long long ns)
return 0; return 0;
} }
static void dummy_event_handler(int val)
{
/* No action needed */
}
static int nanosleep_test_remaining(int clockid)
{
struct timespec rqtp = {}, rmtp = {};
struct itimerspec itimer = {};
struct sigaction sa = {};
timer_t timer;
int ret;
sa.sa_handler = dummy_event_handler;
ret = sigaction(SIGALRM, &sa, NULL);
if (ret)
return -1;
ret = timer_create(clockid, NULL, &timer);
if (ret)
return -1;
itimer.it_value.tv_nsec = NSEC_PER_SEC / 4;
ret = timer_settime(timer, 0, &itimer, NULL);
if (ret)
return -1;
rqtp.tv_nsec = NSEC_PER_SEC / 2;
ret = clock_nanosleep(clockid, 0, &rqtp, &rmtp);
if (ret != EINTR)
return -1;
ret = timer_delete(timer);
if (ret)
return -1;
sa.sa_handler = SIG_DFL;
ret = sigaction(SIGALRM, &sa, NULL);
if (ret)
return -1;
if (!in_order((struct timespec) {}, rmtp))
return -1;
if (!in_order(rmtp, rqtp))
return -1;
return 0;
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
long long length; long long length;
@ -150,6 +200,11 @@ int main(int argc, char **argv)
} }
length *= 100; length *= 100;
} }
ret = nanosleep_test_remaining(clockid);
if (ret < 0) {
ksft_test_result_fail("%-31s\n", clockstring(clockid));
ksft_exit_fail();
}
ksft_test_result_pass("%-31s\n", clockstring(clockid)); ksft_test_result_pass("%-31s\n", clockstring(clockid));
next: next:
ret = 0; ret = 0;

View File

@ -18,6 +18,7 @@
#include <time.h> #include <time.h>
#include <include/vdso/time64.h> #include <include/vdso/time64.h>
#include <pthread.h> #include <pthread.h>
#include <stdbool.h>
#include "../kselftest.h" #include "../kselftest.h"
@ -670,8 +671,14 @@ static void check_timer_create_exact(void)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
bool run_sig_ign_tests = ksft_min_kernel_version(6, 13);
ksft_print_header(); ksft_print_header();
ksft_set_plan(19); if (run_sig_ign_tests) {
ksft_set_plan(19);
} else {
ksft_set_plan(10);
}
ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n");
ksft_print_msg("based timers if other threads run on the CPU...\n"); ksft_print_msg("based timers if other threads run on the CPU...\n");
@ -695,15 +702,20 @@ int main(int argc, char **argv)
check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_timer_create(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_timer_distribution(); check_timer_distribution();
check_sig_ign(0); if (run_sig_ign_tests) {
check_sig_ign(1); check_sig_ign(0);
check_rearm(); check_sig_ign(1);
check_delete(); check_rearm();
check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_delete();
check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_sigev_none(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_sigev_none(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_gettime(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); check_gettime(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_gettime(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");
} else {
ksft_print_msg("Skipping SIG_IGN tests on kernel < 6.13\n");
}
check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC"); check_overrun(CLOCK_MONOTONIC, "CLOCK_MONOTONIC");
check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID"); check_overrun(CLOCK_PROCESS_CPUTIME_ID, "CLOCK_PROCESS_CPUTIME_ID");
check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID"); check_overrun(CLOCK_THREAD_CPUTIME_ID, "CLOCK_THREAD_CPUTIME_ID");