mirror of https://github.com/torvalds/linux.git
Merge tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer fixes from Borislav Petkov:

 - Reset hrtimers correctly when a CPU hotplug state traversal happens
   "half-ways" and leaves hrtimers not (re-)initialized properly

 - Annotate accesses to a timer group's ignore flag to prevent KCSAN
   from raising data_race warnings

 - Make sure timer group initialization is visible to timer tree walkers
   and avoid a hypothetical race

 - Fix another race between CPU hotplug and idle entry/exit where timers
   on a fully idle system are getting ignored

 - Fix a case where an ignored signal is still being handled which it
   shouldn't be

* tag 'timers_urgent_for_v6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  hrtimers: Handle CPU state correctly on hotplug
  timers/migration: Annotate accesses to ignore flag
  timers/migration: Enforce group initialization visibility to tree walkers
  timers/migration: Fix another race between hotplug and idle entry/exit
  signal/posixtimers: Handle ignore/blocked sequences correctly
commit 25144ea31b
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -386,6 +386,7 @@ extern void __init hrtimers_init(void);
 extern void sysrq_timer_list_show(void);
 
 int hrtimers_prepare_cpu(unsigned int cpu);
+int hrtimers_cpu_starting(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_cpu_dying(unsigned int cpu);
 #else
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2179,7 +2179,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
         },
         [CPUHP_AP_HRTIMERS_DYING] = {
                 .name = "hrtimers:dying",
-                .startup.single = NULL,
+                .startup.single = hrtimers_cpu_starting,
                 .teardown.single = hrtimers_cpu_dying,
         },
         [CPUHP_AP_TICK_DYING] = {
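The change above pairs the hrtimers:dying teardown with a startup callback, so a CPU coming back online (including after a partially completed hotplug attempt) re-initializes its per-CPU hrtimer state instead of keeping whatever the teardown left behind. Below is a minimal, self-contained userspace sketch of such a paired startup/teardown step table; the names (cpu_state, step, bring_up, tear_down) are illustrative and not the kernel's cpuhp API.

/*
 * Userspace model (not kernel code) of hotplug-style paired callbacks.
 * If a step has only a teardown callback, state cleared on the way down
 * is never restored on the way up; that is the class of bug the hunk
 * above fixes by giving the hrtimers:dying step a startup callback.
 */
#include <stdio.h>
#include <stdbool.h>

struct cpu_state {
        bool online;            /* stands in for per-CPU "online" state */
};

struct step {
        const char *name;
        int (*startup)(struct cpu_state *);
        int (*teardown)(struct cpu_state *);
};

static int timers_starting(struct cpu_state *s)
{
        s->online = true;       /* clear out left-over "down" state */
        return 0;
}

static int timers_dying(struct cpu_state *s)
{
        s->online = false;
        return 0;
}

static const struct step steps[] = {
        { "timers", timers_starting, timers_dying },
};

static void bring_up(struct cpu_state *s)
{
        for (size_t i = 0; i < sizeof(steps) / sizeof(steps[0]); i++)
                if (steps[i].startup)
                        steps[i].startup(s);
}

static void tear_down(struct cpu_state *s)
{
        for (size_t i = sizeof(steps) / sizeof(steps[0]); i-- > 0; )
                if (steps[i].teardown)
                        steps[i].teardown(s);
}

int main(void)
{
        struct cpu_state cpu = { .online = true };

        tear_down(&cpu);        /* CPU goes down: online becomes false */
        bring_up(&cpu);         /* with a startup callback, online is restored */
        printf("online after re-plug: %d\n", cpu.online);
        return 0;
}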
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
 
         if (!list_empty(&q->list)) {
                 /*
-                 * If task group is exiting with the signal already pending,
-                 * wait for __exit_signal() to do its job. Otherwise if
-                 * ignored, it's not supposed to be queued. Try to survive.
+                 * The signal was ignored and blocked. The timer
+                 * expiry queued it because blocked signals are
+                 * queued independent of the ignored state.
+                 *
+                 * The unblocking set SIGPENDING, but the signal
+                 * was not yet dequeued from the pending list.
+                 * So prepare_signal() sees unblocked and ignored,
+                 * which ends up here. Leave it queued like a
+                 * regular signal.
+                 *
+                 * The same happens when the task group is exiting
+                 * and the signal is already queued.
+                 * prepare_signal() treats SIGNAL_GROUP_EXIT as
+                 * ignored independent of its queued state. This
+                 * gets cleaned up in __exit_signal().
                  */
-                WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
                 goto out;
         }
 
@@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
                 goto out;
         }
 
-        /* This should never happen and leaks a reference count */
-        if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
-                hlist_del_init(&tmr->ignored_list);
-
         if (unlikely(!list_empty(&q->list))) {
                 /* This holds a reference count already */
                 result = TRACE_SIGNAL_ALREADY_PENDING;
                 goto out;
         }
 
-        posixtimer_sigqueue_getref(q);
+        /*
+         * If the signal is on the ignore list, it got blocked after it was
+         * ignored earlier. But nothing lifted the ignore. Move it back to
+         * the pending list to be consistent with the regular signal
+         * handling. This already holds a reference count.
+         *
+         * If it's not on the ignore list acquire a reference count.
+         */
+        if (likely(hlist_unhashed(&tmr->ignored_list)))
+                posixtimer_sigqueue_getref(q);
+        else
+                hlist_del_init(&tmr->ignored_list);
+
         posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
         result = TRACE_SIGNAL_DELIVERED;
 out:
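The kernel/signal.c hunks above change posixtimer_send_sigqueue() so that a timer signal which was ignored and then blocked is moved back from the ignored list to the pending list, reusing the reference it already holds, instead of warning and leaking it. The following is a toy userspace model of that decision only, assuming nothing beyond standard C; the structure and field names (sigentry, queued, on_ignored) are illustrative, not the kernel's data structures.

/*
 * Userspace sketch (not kernel code) of the queueing decision above:
 * an entry already queued stays as-is, an entry on the "ignored" list
 * already owns a reference and is simply moved back to pending, and
 * only a completely idle entry takes a new reference.
 */
#include <stdio.h>
#include <stdbool.h>

struct sigentry {
        int refcount;
        bool queued;            /* analogue of being on q->list */
        bool on_ignored;        /* analogue of tmr->ignored_list membership */
};

static const char *send_timer_signal(struct sigentry *e)
{
        if (e->queued)
                return "already pending";       /* holds a reference already */

        if (e->on_ignored) {
                /* Was ignored, then blocked: requeue and reuse its reference. */
                e->on_ignored = false;
        } else {
                /* Fresh delivery: acquire a reference for the pending list. */
                e->refcount++;
        }
        e->queued = true;
        return "delivered";
}

int main(void)
{
        struct sigentry fresh   = { .refcount = 1 };
        struct sigentry ignored = { .refcount = 2, .on_ignored = true };

        printf("fresh:   %s, refs=%d\n", send_timer_signal(&fresh), fresh.refcount);
        printf("ignored: %s, refs=%d\n", send_timer_signal(&ignored), ignored.refcount);
        return 0;
}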
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2202,6 +2202,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
         }
 
         cpu_base->cpu = cpu;
+        hrtimer_cpu_base_init_expiry_lock(cpu_base);
+        return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+        /* Clear out any left over state from a CPU down operation */
         cpu_base->active_bases = 0;
         cpu_base->hres_active = 0;
         cpu_base->hang_detected = 0;
@@ -2210,7 +2219,6 @@ int hrtimers_prepare_cpu(unsigned int cpu)
         cpu_base->expires_next = KTIME_MAX;
         cpu_base->softirq_expires_next = KTIME_MAX;
         cpu_base->online = 1;
-        hrtimer_cpu_base_init_expiry_lock(cpu_base);
         return 0;
 }
 
@@ -2286,5 +2294,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
 void __init hrtimers_init(void)
 {
         hrtimers_prepare_cpu(smp_processor_id());
+        hrtimers_cpu_starting(smp_processor_id());
         open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
 }
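The kernel/time/hrtimer.c hunks above split the one-time per-CPU setup (hrtimers_prepare_cpu()) from the transient state that must be reset on every onlining (hrtimers_cpu_starting()), and the boot path now calls both. Below is a short userspace sketch of that init-once versus reset-on-each-online split; the names (timer_base, base_prepare, base_starting, base_dying) are illustrative, not the kernel's.

/*
 * Userspace sketch (not kernel code): prepare() runs once per CPU slot
 * and sets up invariant structure, while starting() runs on every
 * (re-)onlining and resets transient state that a previous offline or
 * failed hotplug attempt may have left behind.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define EXPIRES_NONE UINT64_MAX         /* stands in for KTIME_MAX */

struct timer_base {
        int cpu;                        /* invariant: set up once */
        bool prepared;
        /* transient: must be reset every time the CPU comes online */
        bool online;
        uint64_t expires_next;
};

static void base_prepare(struct timer_base *b, int cpu)
{
        if (b->prepared)
                return;                 /* one-time setup only */
        b->cpu = cpu;
        b->prepared = true;
}

static void base_starting(struct timer_base *b)
{
        /* Clear out any left-over state from a previous down operation. */
        b->online = true;
        b->expires_next = EXPIRES_NONE;
}

static void base_dying(struct timer_base *b)
{
        b->online = false;
}

int main(void)
{
        struct timer_base b = { 0 };

        /* Boot: both steps run, mirroring hrtimers_init() above. */
        base_prepare(&b, 0);
        base_starting(&b);

        /* Offline and online again: only the transient reset is repeated. */
        base_dying(&b);
        base_starting(&b);

        printf("cpu=%d online=%d\n", b.cpu, b.online);
        return 0;
}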
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
                         break;
 
                 child = group;
-                group = group->parent;
+                /*
+                 * Pairs with the store release on group connection
+                 * to make sure group initialization is visible.
+                 */
+                group = READ_ONCE(group->parent);
                 data->childmask = child->groupmask;
+                WARN_ON_ONCE(!data->childmask);
         } while (group);
 }
 
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
         while ((node = timerqueue_getnext(&group->events))) {
                 evt = container_of(node, struct tmigr_event, nextevt);
 
-                if (!evt->ignore) {
+                if (!READ_ONCE(evt->ignore)) {
                         WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
                         return evt;
                 }
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
          * lock is held while updating the ignore flag in idle path. So this
          * state change will not be lost.
          */
-        group->groupevt.ignore = true;
+        WRITE_ONCE(group->groupevt.ignore, true);
 
         return walk_done;
 }
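The two hunks above replace plain loads and stores of the lockless ignore flag with READ_ONCE()/WRITE_ONCE(): the race is deliberate, and the marked accesses keep the compiler from tearing or fusing them while telling KCSAN that the concurrency is intended. A minimal userspace analogue of that annotation pattern, using C11 relaxed atomics in place of the kernel accessors (the flag name and thread roles are illustrative), is shown here; the remaining kernel/time/timer_migration.c hunks continue below.

/*
 * Userspace analogue (not kernel code) of READ_ONCE()/WRITE_ONCE() on a
 * flag that is read without holding the writer's lock.  Relaxed atomics
 * give no ordering, only race-free, non-torn single accesses, which is
 * exactly what the annotations above document.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool ignore;              /* analogue of evt->ignore */

/* Remote walker: peeks at the flag locklessly. */
static void *walker(void *arg)
{
        int candidates = 0;

        (void)arg;
        for (int i = 0; i < 1000; i++) {
                /* Lockless peek, analogous to READ_ONCE(evt->ignore). */
                if (!atomic_load_explicit(&ignore, memory_order_relaxed))
                        candidates++;
        }
        printf("walker saw %d non-ignored snapshots\n", candidates);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, walker, NULL);

        /* Idle/active path: flips the flag while the walker may be reading. */
        atomic_store_explicit(&ignore, true, memory_order_relaxed);

        pthread_join(t, NULL);
        return 0;
}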
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
         union tmigr_state childstate, groupstate;
         bool remote = data->remote;
         bool walk_done = false;
+        bool ignore;
         u64 nextexp;
 
         if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
                 nextexp = child->next_expiry;
                 evt = &child->groupevt;
 
-                evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+                /*
+                 * This can race with concurrent idle exit (activate).
+                 * If the current writer wins, a useless remote expiration may
+                 * be scheduled. If the activate wins, the event is properly
+                 * ignored.
+                 */
+                ignore = (nextexp == KTIME_MAX) ? true : false;
+                WRITE_ONCE(evt->ignore, ignore);
         } else {
                 nextexp = data->nextexp;
 
                 first_childevt = evt = data->evt;
+                ignore = evt->ignore;
 
                 /*
                  * Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
          * first event information of the group is updated properly and
          * also handled properly, so skip this fast return path.
          */
-        if (evt->ignore && !remote && group->parent)
+        if (ignore && !remote && group->parent)
                 return true;
 
         raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
          * queue when the expiry time changed only or when it could be ignored.
          */
         if (timerqueue_node_queued(&evt->nextevt)) {
-                if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+                if ((evt->nextevt.expires == nextexp) && !ignore) {
                         /* Make sure not to miss a new CPU event with the same expiry */
                         evt->cpu = first_childevt->cpu;
                         goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
                 WRITE_ONCE(group->next_expiry, KTIME_MAX);
         }
 
-        if (evt->ignore) {
+        if (ignore) {
                 /*
                  * When the next child event could be ignored (nextexp is
                  * KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
         s.seq = 0;
         atomic_set(&group->migr_state, s.state);
 
+        /*
+         * If this is a new top-level, prepare its groupmask in advance.
+         * This avoids accidents where yet another new top-level is
+         * created in the future and made visible before the current groupmask.
+         */
+        if (list_empty(&tmigr_level_list[lvl])) {
+                group->groupmask = BIT(0);
+                /*
+                 * The previous top level has prepared its groupmask already,
+                 * simply account it as the first child.
+                 */
+                if (lvl > 0)
+                        group->num_children = 1;
+        }
+
         timerqueue_init_head(&group->events);
         timerqueue_init(&group->groupevt.nextevt);
         group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
         raw_spin_lock_irq(&child->lock);
         raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
 
-        child->parent = parent;
-        child->groupmask = BIT(parent->num_children++);
+        if (activate) {
+                /*
+                 * @child is the old top and @parent the new one. In this
+                 * case groupmask is pre-initialized and @child already
+                 * accounted, along with its new sibling corresponding to the
+                 * CPU going up.
+                 */
+                WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+        } else {
+                /* Adding @child for the CPU going up to @parent. */
+                child->groupmask = BIT(parent->num_children++);
+        }
+
+        /*
+         * Make sure parent initialization is visible before publishing it to a
+         * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+         * address dependency that pairs with the READ_ONCE() in __walk_groups().
+         */
+        smp_store_release(&child->parent, parent);
 
         raw_spin_unlock(&parent->lock);
         raw_spin_unlock_irq(&child->lock);
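The tmigr_connect_child_parent() hunk above fully initializes the new parent group and only then publishes the child->parent pointer with smp_store_release(), pairing with the READ_ONCE() in __walk_groups(). A small userspace model of that publication pattern, assuming only C11 atomics and pthreads (the group layout and names are illustrative, not the kernel's, and acquire stands in for READ_ONCE() plus the address dependency):

/*
 * Userspace model (not kernel code): initialize the object fully, then
 * publish the pointer with a release store; a lockless reader that
 * observes the pointer with an acquire load is guaranteed to also see
 * the initialized fields.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct group {
        unsigned int groupmask;                 /* must be visible before the pointer */
        _Atomic(struct group *) parent;
};

static struct group child;
static struct group new_top;

static void *connect(void *arg)
{
        (void)arg;
        /* Initialize the parent first ... */
        new_top.groupmask = 1u << 0;
        /* ... then publish it. Pairs with the acquire load in walk(). */
        atomic_store_explicit(&child.parent, &new_top, memory_order_release);
        return NULL;
}

static void *walk(void *arg)
{
        struct group *p;

        (void)arg;
        p = atomic_load_explicit(&child.parent, memory_order_acquire);
        if (p)  /* if the pointer is visible, so is its initialization */
                printf("walker sees groupmask=%u\n", p->groupmask);
        else
                printf("walker ran before publication\n");
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, connect, NULL);
        pthread_create(&b, NULL, walk, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}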