Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'timers-core-2020-08-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull more timer updates from Thomas Gleixner:
"A set of posix CPU timer changes which allows to defer the heavy work
of posix CPU timers into task work context. The tick interrupt is
reduced to a quick check which queues the work which is doing the
heavy lifting before returning to user space or going back to guest
mode. Moving this out is deferring the signal delivery slightly but
posix CPU timers are inaccurate by nature as they depend on the tick
so there is no real damage. The relevant test cases all passed.

This lifts the last offender for RT out of the hard interrupt context
tick handler, but it also has the general benefit that the actual
heavy work is accounted to the task/process and not to the tick
interrupt itself.

Further optimizations are possible to break long sighand lock hold and
interrupt disabled (on !RT kernels) times when a massive amount of
posix CPU timers (which are unprivileged) is armed for a
task/process.

This is currently only enabled for x86 because the architecture has to
ensure that task work is handled in KVM before entering a guest, which
was just established for x86 with the new common entry/exit code which
got merged post 5.8 and is not the case for other KVM architectures"

* tag 'timers-core-2020-08-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86: Select POSIX_CPU_TIMERS_TASK_WORK
posix-cpu-timers: Provide mechanisms to defer timer handling to task_work
posix-cpu-timers: Split run_posix_cpu_timers()

+227 -29
+1
arch/x86/Kconfig
··· 209 209 select HAVE_PERF_REGS 210 210 select HAVE_PERF_USER_STACK_DUMP 211 211 select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT 212 + select HAVE_POSIX_CPU_TIMERS_TASK_WORK 212 213 select HAVE_REGS_AND_STACK_ACCESS_API 213 214 select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION 214 215 select HAVE_FUNCTION_ARG_ACCESS_API
+17
include/linux/posix-timers.h
··· 6 6 #include <linux/list.h> 7 7 #include <linux/alarmtimer.h> 8 8 #include <linux/timerqueue.h> 9 + #include <linux/task_work.h> 9 10 10 11 struct kernel_siginfo; 11 12 struct task_struct; ··· 126 125 unsigned int expiry_active; 127 126 }; 128 127 128 + /** 129 + * posix_cputimers_work - Container for task work based posix CPU timer expiry 130 + * @work: The task work to be scheduled 131 + * @scheduled: @work has been scheduled already, no further processing 132 + */ 133 + struct posix_cputimers_work { 134 + struct callback_head work; 135 + unsigned int scheduled; 136 + }; 137 + 129 138 static inline void posix_cputimers_init(struct posix_cputimers *pct) 130 139 { 131 140 memset(pct, 0, sizeof(*pct)); ··· 174 163 static inline void posix_cputimers_init(struct posix_cputimers *pct) { } 175 164 static inline void posix_cputimers_group_init(struct posix_cputimers *pct, 176 165 u64 cpu_limit) { } 166 + #endif 167 + 168 + #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK 169 + void posix_cputimers_init_work(void); 170 + #else 171 + static inline void posix_cputimers_init_work(void) { } 177 172 #endif 178 173 179 174 #define REQUEUE_PENDING 1
+4
include/linux/sched.h
··· 890 890 /* Empty if CONFIG_POSIX_CPUTIMERS=n */ 891 891 struct posix_cputimers posix_cputimers; 892 892 893 + #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK 894 + struct posix_cputimers_work posix_cputimers_work; 895 + #endif 896 + 893 897 /* Process credentials: */ 894 898 895 899 /* Tracer's credentials at attach: */
+9
kernel/time/Kconfig
··· 52 52 config GENERIC_CMOS_UPDATE 53 53 bool 54 54 55 + # Select to handle posix CPU timers from task_work 56 + # and not from the timer interrupt context 57 + config HAVE_POSIX_CPU_TIMERS_TASK_WORK 58 + bool 59 + 60 + config POSIX_CPU_TIMERS_TASK_WORK 61 + bool 62 + default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK 63 + 55 64 if GENERIC_CLOCKEVENTS 56 65 menu "Timers subsystem" 57 66
+195 -29
kernel/time/posix-cpu-timers.c
··· 377 377 */ 378 378 static int posix_cpu_timer_create(struct k_itimer *new_timer) 379 379 { 380 + static struct lock_class_key posix_cpu_timers_key; 380 381 struct pid *pid; 381 382 382 383 rcu_read_lock(); ··· 386 385 rcu_read_unlock(); 387 386 return -EINVAL; 388 387 } 388 + 389 + /* 390 + * If posix timer expiry is handled in task work context then 391 + * timer::it_lock can be taken without disabling interrupts as all 392 + * other locking happens in task context. This requires a seperate 393 + * lock class key otherwise regular posix timer expiry would record 394 + * the lock class being taken in interrupt context and generate a 395 + * false positive warning. 396 + */ 397 + if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK)) 398 + lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key); 389 399 390 400 new_timer->kclock = &clock_posix_cpu; 391 401 timerqueue_init(&new_timer->it.cpu.node); ··· 1092 1080 return false; 1093 1081 } 1094 1082 1095 - /* 1096 - * This is called from the timer interrupt handler. The irq handler has 1097 - * already updated our counts. We need to check if any timers fire now. 1098 - * Interrupts are disabled. 1099 - */ 1100 - void run_posix_cpu_timers(void) 1083 + static void handle_posix_cpu_timers(struct task_struct *tsk); 1084 + 1085 + #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK 1086 + static void posix_cpu_timers_work(struct callback_head *work) 1101 1087 { 1102 - struct task_struct *tsk = current; 1088 + handle_posix_cpu_timers(current); 1089 + } 1090 + 1091 + /* 1092 + * Initialize posix CPU timers task work in init task. Out of line to 1093 + * keep the callback static and to avoid header recursion hell. 
1094 + */ 1095 + void __init posix_cputimers_init_work(void) 1096 + { 1097 + init_task_work(&current->posix_cputimers_work.work, 1098 + posix_cpu_timers_work); 1099 + } 1100 + 1101 + /* 1102 + * Note: All operations on tsk->posix_cputimer_work.scheduled happen either 1103 + * in hard interrupt context or in task context with interrupts 1104 + * disabled. Aside of that the writer/reader interaction is always in the 1105 + * context of the current task, which means they are strict per CPU. 1106 + */ 1107 + static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) 1108 + { 1109 + return tsk->posix_cputimers_work.scheduled; 1110 + } 1111 + 1112 + static inline void __run_posix_cpu_timers(struct task_struct *tsk) 1113 + { 1114 + if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled)) 1115 + return; 1116 + 1117 + /* Schedule task work to actually expire the timers */ 1118 + tsk->posix_cputimers_work.scheduled = true; 1119 + task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME); 1120 + } 1121 + 1122 + static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, 1123 + unsigned long start) 1124 + { 1125 + bool ret = true; 1126 + 1127 + /* 1128 + * On !RT kernels interrupts are disabled while collecting expired 1129 + * timers, so no tick can happen and the fast path check can be 1130 + * reenabled without further checks. 1131 + */ 1132 + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { 1133 + tsk->posix_cputimers_work.scheduled = false; 1134 + return true; 1135 + } 1136 + 1137 + /* 1138 + * On RT enabled kernels ticks can happen while the expired timers 1139 + * are collected under sighand lock. But any tick which observes 1140 + * the CPUTIMERS_WORK_SCHEDULED bit set, does not run the fastpath 1141 + * checks. So reenabling the tick work has do be done carefully: 1142 + * 1143 + * Disable interrupts and run the fast path check if jiffies have 1144 + * advanced since the collecting of expired timers started. 
If 1145 + * jiffies have not advanced or the fast path check did not find 1146 + * newly expired timers, reenable the fast path check in the timer 1147 + * interrupt. If there are newly expired timers, return false and 1148 + * let the collection loop repeat. 1149 + */ 1150 + local_irq_disable(); 1151 + if (start != jiffies && fastpath_timer_check(tsk)) 1152 + ret = false; 1153 + else 1154 + tsk->posix_cputimers_work.scheduled = false; 1155 + local_irq_enable(); 1156 + 1157 + return ret; 1158 + } 1159 + #else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ 1160 + static inline void __run_posix_cpu_timers(struct task_struct *tsk) 1161 + { 1162 + lockdep_posixtimer_enter(); 1163 + handle_posix_cpu_timers(tsk); 1164 + lockdep_posixtimer_exit(); 1165 + } 1166 + 1167 + static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk) 1168 + { 1169 + return false; 1170 + } 1171 + 1172 + static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk, 1173 + unsigned long start) 1174 + { 1175 + return true; 1176 + } 1177 + #endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */ 1178 + 1179 + static void handle_posix_cpu_timers(struct task_struct *tsk) 1180 + { 1103 1181 struct k_itimer *timer, *next; 1104 - unsigned long flags; 1182 + unsigned long flags, start; 1105 1183 LIST_HEAD(firing); 1106 1184 1107 - lockdep_assert_irqs_disabled(); 1108 - 1109 - /* 1110 - * The fast path checks that there are no expired thread or thread 1111 - * group timers. If that's so, just return. 1112 - */ 1113 - if (!fastpath_timer_check(tsk)) 1185 + if (!lock_task_sighand(tsk, &flags)) 1114 1186 return; 1115 1187 1116 - lockdep_posixtimer_enter(); 1117 - if (!lock_task_sighand(tsk, &flags)) { 1118 - lockdep_posixtimer_exit(); 1119 - return; 1120 - } 1121 - /* 1122 - * Here we take off tsk->signal->cpu_timers[N] and 1123 - * tsk->cpu_timers[N] all the timers that are firing, and 1124 - * put them on the firing list. 
1125 - */ 1126 - check_thread_timers(tsk, &firing); 1188 + do { 1189 + /* 1190 + * On RT locking sighand lock does not disable interrupts, 1191 + * so this needs to be careful vs. ticks. Store the current 1192 + * jiffies value. 1193 + */ 1194 + start = READ_ONCE(jiffies); 1195 + barrier(); 1127 1196 1128 - check_process_timers(tsk, &firing); 1197 + /* 1198 + * Here we take off tsk->signal->cpu_timers[N] and 1199 + * tsk->cpu_timers[N] all the timers that are firing, and 1200 + * put them on the firing list. 1201 + */ 1202 + check_thread_timers(tsk, &firing); 1203 + 1204 + check_process_timers(tsk, &firing); 1205 + 1206 + /* 1207 + * The above timer checks have updated the exipry cache and 1208 + * because nothing can have queued or modified timers after 1209 + * sighand lock was taken above it is guaranteed to be 1210 + * consistent. So the next timer interrupt fastpath check 1211 + * will find valid data. 1212 + * 1213 + * If timer expiry runs in the timer interrupt context then 1214 + * the loop is not relevant as timers will be directly 1215 + * expired in interrupt context. The stub function below 1216 + * returns always true which allows the compiler to 1217 + * optimize the loop out. 1218 + * 1219 + * If timer expiry is deferred to task work context then 1220 + * the following rules apply: 1221 + * 1222 + * - On !RT kernels no tick can have happened on this CPU 1223 + * after sighand lock was acquired because interrupts are 1224 + * disabled. So reenabling task work before dropping 1225 + * sighand lock and reenabling interrupts is race free. 1226 + * 1227 + * - On RT kernels ticks might have happened but the tick 1228 + * work ignored posix CPU timer handling because the 1229 + * CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work 1230 + * must be done very carefully including a check whether 1231 + * ticks have happened since the start of the timer 1232 + * expiry checks. 
posix_cpu_timers_enable_work() takes 1233 + * care of that and eventually lets the expiry checks 1234 + * run again. 1235 + */ 1236 + } while (!posix_cpu_timers_enable_work(tsk, start)); 1129 1237 1130 1238 /* 1131 - * We must release these locks before taking any timer's lock. 1239 + * We must release sighand lock before taking any timer's lock. 1132 1240 * There is a potential race with timer deletion here, as the 1133 1241 * siglock now protects our private firing list. We have set 1134 1242 * the firing flag in each timer, so that a deletion attempt ··· 1266 1134 list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) { 1267 1135 int cpu_firing; 1268 1136 1137 + /* 1138 + * spin_lock() is sufficient here even independent of the 1139 + * expiry context. If expiry happens in hard interrupt 1140 + * context it's obvious. For task work context it's safe 1141 + * because all other operations on timer::it_lock happen in 1142 + * task context (syscall or exit). 1143 + */ 1269 1144 spin_lock(&timer->it_lock); 1270 1145 list_del_init(&timer->it.cpu.elist); 1271 1146 cpu_firing = timer->it.cpu.firing; ··· 1286 1147 cpu_timer_fire(timer); 1287 1148 spin_unlock(&timer->it_lock); 1288 1149 } 1289 - lockdep_posixtimer_exit(); 1150 + } 1151 + 1152 + /* 1153 + * This is called from the timer interrupt handler. The irq handler has 1154 + * already updated our counts. We need to check if any timers fire now. 1155 + * Interrupts are disabled. 1156 + */ 1157 + void run_posix_cpu_timers(void) 1158 + { 1159 + struct task_struct *tsk = current; 1160 + 1161 + lockdep_assert_irqs_disabled(); 1162 + 1163 + /* 1164 + * If the actual expiry is deferred to task work context and the 1165 + * work is already scheduled there is no point to do anything here. 1166 + */ 1167 + if (posix_cpu_timers_work_scheduled(tsk)) 1168 + return; 1169 + 1170 + /* 1171 + * The fast path checks that there are no expired thread or thread 1172 + * group timers. If that's so, just return. 
1173 + */ 1174 + if (!fastpath_timer_check(tsk)) 1175 + return; 1176 + 1177 + __run_posix_cpu_timers(tsk); 1290 1178 } 1291 1179 1292 1180 /*
+1
kernel/time/timer.c
··· 2017 2017 void __init init_timers(void) 2018 2018 { 2019 2019 init_timer_cpus(); 2020 + posix_cputimers_init_work(); 2020 2021 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 2021 2022 } 2022 2023