Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'wq-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue updates from Tejun Heo:

- Concurrency-managed per-cpu work items that hog CPUs and delay the
execution of other work items are now automatically detected and
excluded from concurrency management. Reporting on such work items
can also be enabled through a config option.

- Added tools/workqueue/wq_monitor.py which improves visibility into
workqueue usages and behaviors.

- Arnd's minimal fix for gcc-13 enum warning on 32bit compiles,
superseded by commit afa4bb778e48 in mainline.

* tag 'wq-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
workqueue: Disable per-cpu CPU hog detection when wq_cpu_intensive_thresh_us is 0
workqueue: Fix WARN_ON_ONCE() triggers in worker_enter_idle()
workqueue: fix enum type for gcc-13
workqueue: Track and monitor per-workqueue CPU time usage
workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism
workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE
workqueue: Improve locking rule description for worker fields
workqueue: Move worker_set/clr_flags() upwards
workqueue: Re-order struct worker fields
workqueue: Add pwq->stats[] and a monitoring script
Further upgrade queue_work_on() comment

+542 -118
+12
Documentation/admin-guide/kernel-parameters.txt
··· 6972 6972 it can be updated at runtime by writing to the 6973 6973 corresponding sysfs file. 6974 6974 6975 + workqueue.cpu_intensive_thresh_us= 6976 + Per-cpu work items which run for longer than this 6977 + threshold are automatically considered CPU intensive 6978 + and excluded from concurrency management to prevent 6979 + them from noticeably delaying other per-cpu work 6980 + items. Default is 10000 (10ms). 6981 + 6982 + If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel 6983 + will report the work functions which violate this 6984 + threshold repeatedly. They are likely good 6985 + candidates for using WQ_UNBOUND workqueues instead. 6986 + 6975 6987 workqueue.disable_numa 6976 6988 By default, all work items queued to unbound 6977 6989 workqueues are affine to the NUMA nodes they're
+32
Documentation/core-api/workqueue.rst
··· 348 348 level of locality in wq operations and work item execution. 349 349 350 350 351 + Monitoring 352 + ========== 353 + 354 + Use tools/workqueue/wq_monitor.py to monitor workqueue operations: :: 355 + 356 + $ tools/workqueue/wq_monitor.py events 357 + total infl CPUtime CPUitsv CMwake mayday rescued 358 + events 18545 0 6.1 0 5 - - 359 + events_highpri 8 0 0.0 0 0 - - 360 + events_long 3 0 0.0 0 0 - - 361 + events_unbound 38306 0 0.1 - - - - 362 + events_freezable 0 0 0.0 0 0 - - 363 + events_power_efficient 29598 0 0.2 0 0 - - 364 + events_freezable_power_ 10 0 0.0 0 0 - - 365 + sock_diag_events 0 0 0.0 0 0 - - 366 + 367 + total infl CPUtime CPUitsv CMwake mayday rescued 368 + events 18548 0 6.1 0 5 - - 369 + events_highpri 8 0 0.0 0 0 - - 370 + events_long 3 0 0.0 0 0 - - 371 + events_unbound 38322 0 0.1 - - - - 372 + events_freezable 0 0 0.0 0 0 - - 373 + events_power_efficient 29603 0 0.2 0 0 - - 374 + events_freezable_power_ 10 0 0.0 0 0 - - 375 + sock_diag_events 0 0 0.0 0 0 - - 376 + 377 + ... 378 + 379 + See the command's help message for more info. 380 + 381 + 351 382 Debugging 352 383 ========= 353 384 ··· 417 386 418 387 The work item's function should be trivially visible in the stack 419 388 trace. 389 + 420 390 421 391 Non-reentrance Conditions 422 392 =========================
+3
kernel/sched/core.c
··· 5670 5670 5671 5671 perf_event_task_tick(); 5672 5672 5673 + if (curr->flags & PF_WQ_WORKER) 5674 + wq_worker_tick(curr); 5675 + 5673 5676 #ifdef CONFIG_SMP 5674 5677 rq->idle_balance = idle_cpu(cpu); 5675 5678 trigger_load_balance(rq);
+301 -107
kernel/workqueue.c
··· 126 126 * cpu or grabbing pool->lock is enough for read access. If 127 127 * POOL_DISASSOCIATED is set, it's identical to L. 128 128 * 129 + * K: Only modified by worker while holding pool->lock. Can be safely read by 130 + * self, while holding pool->lock or from IRQ context if %current is the 131 + * kworker. 132 + * 133 + * S: Only modified by worker self. 134 + * 129 135 * A: wq_pool_attach_mutex protected. 130 136 * 131 137 * PL: wq_pool_mutex protected. ··· 206 200 }; 207 201 208 202 /* 203 + * Per-pool_workqueue statistics. These can be monitored using 204 + * tools/workqueue/wq_monitor.py. 205 + */ 206 + enum pool_workqueue_stats { 207 + PWQ_STAT_STARTED, /* work items started execution */ 208 + PWQ_STAT_COMPLETED, /* work items completed execution */ 209 + PWQ_STAT_CPU_TIME, /* total CPU time consumed */ 210 + PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ 211 + PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ 212 + PWQ_STAT_MAYDAY, /* maydays to rescuer */ 213 + PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ 214 + 215 + PWQ_NR_STATS, 216 + }; 217 + 218 + /* 209 219 * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS 210 220 * of work_struct->data are used for flags and the remaining high bits 211 221 * point to the pwq; thus, pwqs need to be aligned at two's power of the ··· 257 235 struct list_head inactive_works; /* L: inactive works */ 258 236 struct list_head pwqs_node; /* WR: node on wq->pwqs */ 259 237 struct list_head mayday_node; /* MD: node on wq->maydays */ 238 + 239 + u64 stats[PWQ_NR_STATS]; 260 240 261 241 /* 262 242 * Release of unbound pwq is punted to system_wq. 
See put_pwq() ··· 333 309 334 310 static cpumask_var_t *wq_numa_possible_cpumask; 335 311 /* possible CPUs of each node */ 312 + 313 + /* 314 + * Per-cpu work items which run for longer than the following threshold are 315 + * automatically considered CPU intensive and excluded from concurrency 316 + * management to prevent them from noticeably delaying other per-cpu work items. 317 + */ 318 + static unsigned long wq_cpu_intensive_thresh_us = 10000; 319 + module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); 336 320 337 321 static bool wq_disable_numa; 338 322 module_param_named(disable_numa, wq_disable_numa, bool, 0444); ··· 899 867 } 900 868 901 869 /** 902 - * wq_worker_running - a worker is running again 903 - * @task: task waking up 904 - * 905 - * This function is called when a worker returns from schedule() 906 - */ 907 - void wq_worker_running(struct task_struct *task) 908 - { 909 - struct worker *worker = kthread_data(task); 910 - 911 - if (!worker->sleeping) 912 - return; 913 - 914 - /* 915 - * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check 916 - * and the nr_running increment below, we may ruin the nr_running reset 917 - * and leave with an unexpected pool->nr_running == 1 on the newly unbound 918 - * pool. Protect against such race. 919 - */ 920 - preempt_disable(); 921 - if (!(worker->flags & WORKER_NOT_RUNNING)) 922 - worker->pool->nr_running++; 923 - preempt_enable(); 924 - worker->sleeping = 0; 925 - } 926 - 927 - /** 928 - * wq_worker_sleeping - a worker is going to sleep 929 - * @task: task going to sleep 930 - * 931 - * This function is called from schedule() when a busy worker is 932 - * going to sleep. 
933 - */ 934 - void wq_worker_sleeping(struct task_struct *task) 935 - { 936 - struct worker *worker = kthread_data(task); 937 - struct worker_pool *pool; 938 - 939 - /* 940 - * Rescuers, which may not have all the fields set up like normal 941 - * workers, also reach here, let's not access anything before 942 - * checking NOT_RUNNING. 943 - */ 944 - if (worker->flags & WORKER_NOT_RUNNING) 945 - return; 946 - 947 - pool = worker->pool; 948 - 949 - /* Return if preempted before wq_worker_running() was reached */ 950 - if (worker->sleeping) 951 - return; 952 - 953 - worker->sleeping = 1; 954 - raw_spin_lock_irq(&pool->lock); 955 - 956 - /* 957 - * Recheck in case unbind_workers() preempted us. We don't 958 - * want to decrement nr_running after the worker is unbound 959 - * and nr_running has been reset. 960 - */ 961 - if (worker->flags & WORKER_NOT_RUNNING) { 962 - raw_spin_unlock_irq(&pool->lock); 963 - return; 964 - } 965 - 966 - pool->nr_running--; 967 - if (need_more_worker(pool)) 968 - wake_up_worker(pool); 969 - raw_spin_unlock_irq(&pool->lock); 970 - } 971 - 972 - /** 973 - * wq_worker_last_func - retrieve worker's last work function 974 - * @task: Task to retrieve last work function of. 975 - * 976 - * Determine the last function a worker executed. This is called from 977 - * the scheduler to get a worker's last known identity. 978 - * 979 - * CONTEXT: 980 - * raw_spin_lock_irq(rq->lock) 981 - * 982 - * This function is called during schedule() when a kworker is going 983 - * to sleep. It's used by psi to identify aggregation workers during 984 - * dequeuing, to allow periodic aggregation to shut-off when that 985 - * worker is the last task in the system or cgroup to go to sleep. 
986 - * 987 - * As this function doesn't involve any workqueue-related locking, it 988 - * only returns stable values when called from inside the scheduler's 989 - * queuing and dequeuing paths, when @task, which must be a kworker, 990 - * is guaranteed to not be processing any works. 991 - * 992 - * Return: 993 - * The last work function %current executed as a worker, NULL if it 994 - * hasn't executed any work yet. 995 - */ 996 - work_func_t wq_worker_last_func(struct task_struct *task) 997 - { 998 - struct worker *worker = kthread_data(task); 999 - 1000 - return worker->last_func; 1001 - } 1002 - 1003 - /** 1004 870 * worker_set_flags - set worker flags and adjust nr_running accordingly 1005 871 * @worker: self 1006 872 * @flags: flags to set ··· 950 1020 if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) 951 1021 if (!(worker->flags & WORKER_NOT_RUNNING)) 952 1022 pool->nr_running++; 1023 + } 1024 + 1025 + #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT 1026 + 1027 + /* 1028 + * Concurrency-managed per-cpu work items that hog CPU for longer than 1029 + * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism, 1030 + * which prevents them from stalling other concurrency-managed work items. If a 1031 + * work function keeps triggering this mechanism, it's likely that the work item 1032 + * should be using an unbound workqueue instead. 1033 + * 1034 + * wq_cpu_intensive_report() tracks work functions which trigger such conditions 1035 + * and report them so that they can be examined and converted to use unbound 1036 + * workqueues as appropriate. To avoid flooding the console, each violating work 1037 + * function is tracked and reported with exponential backoff. 
1038 + */ 1039 + #define WCI_MAX_ENTS 128 1040 + 1041 + struct wci_ent { 1042 + work_func_t func; 1043 + atomic64_t cnt; 1044 + struct hlist_node hash_node; 1045 + }; 1046 + 1047 + static struct wci_ent wci_ents[WCI_MAX_ENTS]; 1048 + static int wci_nr_ents; 1049 + static DEFINE_RAW_SPINLOCK(wci_lock); 1050 + static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); 1051 + 1052 + static struct wci_ent *wci_find_ent(work_func_t func) 1053 + { 1054 + struct wci_ent *ent; 1055 + 1056 + hash_for_each_possible_rcu(wci_hash, ent, hash_node, 1057 + (unsigned long)func) { 1058 + if (ent->func == func) 1059 + return ent; 1060 + } 1061 + return NULL; 1062 + } 1063 + 1064 + static void wq_cpu_intensive_report(work_func_t func) 1065 + { 1066 + struct wci_ent *ent; 1067 + 1068 + restart: 1069 + ent = wci_find_ent(func); 1070 + if (ent) { 1071 + u64 cnt; 1072 + 1073 + /* 1074 + * Start reporting from the fourth time and back off 1075 + * exponentially. 1076 + */ 1077 + cnt = atomic64_inc_return_relaxed(&ent->cnt); 1078 + if (cnt >= 4 && is_power_of_2(cnt)) 1079 + printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", 1080 + ent->func, wq_cpu_intensive_thresh_us, 1081 + atomic64_read(&ent->cnt)); 1082 + return; 1083 + } 1084 + 1085 + /* 1086 + * @func is a new violation. Allocate a new entry for it. If wci_ents[] 1087 + * is exhausted, something went really wrong and we probably made enough 1088 + * noise already. 
1089 + */ 1090 + if (wci_nr_ents >= WCI_MAX_ENTS) 1091 + return; 1092 + 1093 + raw_spin_lock(&wci_lock); 1094 + 1095 + if (wci_nr_ents >= WCI_MAX_ENTS) { 1096 + raw_spin_unlock(&wci_lock); 1097 + return; 1098 + } 1099 + 1100 + if (wci_find_ent(func)) { 1101 + raw_spin_unlock(&wci_lock); 1102 + goto restart; 1103 + } 1104 + 1105 + ent = &wci_ents[wci_nr_ents++]; 1106 + ent->func = func; 1107 + atomic64_set(&ent->cnt, 1); 1108 + hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); 1109 + 1110 + raw_spin_unlock(&wci_lock); 1111 + } 1112 + 1113 + #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ 1114 + static void wq_cpu_intensive_report(work_func_t func) {} 1115 + #endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ 1116 + 1117 + /** 1118 + * wq_worker_running - a worker is running again 1119 + * @task: task waking up 1120 + * 1121 + * This function is called when a worker returns from schedule() 1122 + */ 1123 + void wq_worker_running(struct task_struct *task) 1124 + { 1125 + struct worker *worker = kthread_data(task); 1126 + 1127 + if (!READ_ONCE(worker->sleeping)) 1128 + return; 1129 + 1130 + /* 1131 + * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check 1132 + * and the nr_running increment below, we may ruin the nr_running reset 1133 + * and leave with an unexpected pool->nr_running == 1 on the newly unbound 1134 + * pool. Protect against such race. 1135 + */ 1136 + preempt_disable(); 1137 + if (!(worker->flags & WORKER_NOT_RUNNING)) 1138 + worker->pool->nr_running++; 1139 + preempt_enable(); 1140 + 1141 + /* 1142 + * CPU intensive auto-detection cares about how long a work item hogged 1143 + * CPU without sleeping. Reset the starting timestamp on wakeup. 
1144 + */ 1145 + worker->current_at = worker->task->se.sum_exec_runtime; 1146 + 1147 + WRITE_ONCE(worker->sleeping, 0); 1148 + } 1149 + 1150 + /** 1151 + * wq_worker_sleeping - a worker is going to sleep 1152 + * @task: task going to sleep 1153 + * 1154 + * This function is called from schedule() when a busy worker is 1155 + * going to sleep. 1156 + */ 1157 + void wq_worker_sleeping(struct task_struct *task) 1158 + { 1159 + struct worker *worker = kthread_data(task); 1160 + struct worker_pool *pool; 1161 + 1162 + /* 1163 + * Rescuers, which may not have all the fields set up like normal 1164 + * workers, also reach here, let's not access anything before 1165 + * checking NOT_RUNNING. 1166 + */ 1167 + if (worker->flags & WORKER_NOT_RUNNING) 1168 + return; 1169 + 1170 + pool = worker->pool; 1171 + 1172 + /* Return if preempted before wq_worker_running() was reached */ 1173 + if (READ_ONCE(worker->sleeping)) 1174 + return; 1175 + 1176 + WRITE_ONCE(worker->sleeping, 1); 1177 + raw_spin_lock_irq(&pool->lock); 1178 + 1179 + /* 1180 + * Recheck in case unbind_workers() preempted us. We don't 1181 + * want to decrement nr_running after the worker is unbound 1182 + * and nr_running has been reset. 1183 + */ 1184 + if (worker->flags & WORKER_NOT_RUNNING) { 1185 + raw_spin_unlock_irq(&pool->lock); 1186 + return; 1187 + } 1188 + 1189 + pool->nr_running--; 1190 + if (need_more_worker(pool)) { 1191 + worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++; 1192 + wake_up_worker(pool); 1193 + } 1194 + raw_spin_unlock_irq(&pool->lock); 1195 + } 1196 + 1197 + /** 1198 + * wq_worker_tick - a scheduler tick occurred while a kworker is running 1199 + * @task: task currently running 1200 + * 1201 + * Called from scheduler_tick(). We're in the IRQ context and the current 1202 + * worker's fields which follow the 'K' locking rule can be accessed safely. 
1203 + */ 1204 + void wq_worker_tick(struct task_struct *task) 1205 + { 1206 + struct worker *worker = kthread_data(task); 1207 + struct pool_workqueue *pwq = worker->current_pwq; 1208 + struct worker_pool *pool = worker->pool; 1209 + 1210 + if (!pwq) 1211 + return; 1212 + 1213 + pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC; 1214 + 1215 + if (!wq_cpu_intensive_thresh_us) 1216 + return; 1217 + 1218 + /* 1219 + * If the current worker is concurrency managed and hogged the CPU for 1220 + * longer than wq_cpu_intensive_thresh_us, it's automatically marked 1221 + * CPU_INTENSIVE to avoid stalling other concurrency-managed work items. 1222 + * 1223 + * Set @worker->sleeping means that @worker is in the process of 1224 + * switching out voluntarily and won't be contributing to 1225 + * @pool->nr_running until it wakes up. As wq_worker_sleeping() also 1226 + * decrements ->nr_running, setting CPU_INTENSIVE here can lead to 1227 + * double decrements. The task is releasing the CPU anyway. Let's skip. 1228 + * We probably want to make this prettier in the future. 1229 + */ 1230 + if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) || 1231 + worker->task->se.sum_exec_runtime - worker->current_at < 1232 + wq_cpu_intensive_thresh_us * NSEC_PER_USEC) 1233 + return; 1234 + 1235 + raw_spin_lock(&pool->lock); 1236 + 1237 + worker_set_flags(worker, WORKER_CPU_INTENSIVE); 1238 + wq_cpu_intensive_report(worker->current_func); 1239 + pwq->stats[PWQ_STAT_CPU_INTENSIVE]++; 1240 + 1241 + if (need_more_worker(pool)) { 1242 + pwq->stats[PWQ_STAT_CM_WAKEUP]++; 1243 + wake_up_worker(pool); 1244 + } 1245 + 1246 + raw_spin_unlock(&pool->lock); 1247 + } 1248 + 1249 + /** 1250 + * wq_worker_last_func - retrieve worker's last work function 1251 + * @task: Task to retrieve last work function of. 1252 + * 1253 + * Determine the last function a worker executed. This is called from 1254 + * the scheduler to get a worker's last known identity. 
1255 + * 1256 + * CONTEXT: 1257 + * raw_spin_lock_irq(rq->lock) 1258 + * 1259 + * This function is called during schedule() when a kworker is going 1260 + * to sleep. It's used by psi to identify aggregation workers during 1261 + * dequeuing, to allow periodic aggregation to shut-off when that 1262 + * worker is the last task in the system or cgroup to go to sleep. 1263 + * 1264 + * As this function doesn't involve any workqueue-related locking, it 1265 + * only returns stable values when called from inside the scheduler's 1266 + * queuing and dequeuing paths, when @task, which must be a kworker, 1267 + * is guaranteed to not be processing any works. 1268 + * 1269 + * Return: 1270 + * The last work function %current executed as a worker, NULL if it 1271 + * hasn't executed any work yet. 1272 + */ 1273 + work_func_t wq_worker_last_func(struct task_struct *task) 1274 + { 1275 + struct worker *worker = kthread_data(task); 1276 + 1277 + return worker->last_func; 953 1278 } 954 1279 955 1280 /** ··· 1727 1542 * We queue the work to a specific CPU, the caller must ensure it 1728 1543 * can't go away. Callers that fail to ensure that the specified 1729 1544 * CPU cannot go away will execute on a randomly chosen CPU. 1545 + * But note well that callers specifying a CPU that never has been 1546 + * online will get a splat. 1730 1547 * 1731 1548 * Return: %false if @work was already on a queue, %true otherwise. 
1732 1549 */ ··· 2353 2166 get_pwq(pwq); 2354 2167 list_add_tail(&pwq->mayday_node, &wq->maydays); 2355 2168 wake_up_process(wq->rescuer->task); 2169 + pwq->stats[PWQ_STAT_MAYDAY]++; 2356 2170 } 2357 2171 } 2358 2172 ··· 2491 2303 { 2492 2304 struct pool_workqueue *pwq = get_work_pwq(work); 2493 2305 struct worker_pool *pool = worker->pool; 2494 - bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE; 2495 2306 unsigned long work_data; 2496 2307 struct worker *collision; 2497 2308 #ifdef CONFIG_LOCKDEP ··· 2527 2340 worker->current_work = work; 2528 2341 worker->current_func = work->func; 2529 2342 worker->current_pwq = pwq; 2343 + worker->current_at = worker->task->se.sum_exec_runtime; 2530 2344 work_data = *work_data_bits(work); 2531 2345 worker->current_color = get_work_color(work_data); 2532 2346 ··· 2545 2357 * of concurrency management and the next code block will chain 2546 2358 * execution of the pending work items. 2547 2359 */ 2548 - if (unlikely(cpu_intensive)) 2360 + if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE)) 2549 2361 worker_set_flags(worker, WORKER_CPU_INTENSIVE); 2550 2362 2551 2363 /* ··· 2592 2404 * workqueues), so hiding them isn't a problem. 2593 2405 */ 2594 2406 lockdep_invariant_state(true); 2407 + pwq->stats[PWQ_STAT_STARTED]++; 2595 2408 trace_workqueue_execute_start(work); 2596 2409 worker->current_func(work); 2597 2410 /* ··· 2600 2411 * point will only record its address. 
2601 2412 */ 2602 2413 trace_workqueue_execute_end(work, worker->current_func); 2414 + pwq->stats[PWQ_STAT_COMPLETED]++; 2603 2415 lock_map_release(&lockdep_map); 2604 2416 lock_map_release(&pwq->wq->lockdep_map); 2605 2417 ··· 2625 2435 2626 2436 raw_spin_lock_irq(&pool->lock); 2627 2437 2628 - /* clear cpu intensive status */ 2629 - if (unlikely(cpu_intensive)) 2630 - worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2438 + /* 2439 + * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked 2440 + * CPU intensive by wq_worker_tick() if @work hogged CPU longer than 2441 + * wq_cpu_intensive_thresh_us. Clear it. 2442 + */ 2443 + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); 2631 2444 2632 2445 /* tag the worker for identification in schedule() */ 2633 2446 worker->last_func = worker->current_func; ··· 2847 2654 if (first) 2848 2655 pool->watchdog_ts = jiffies; 2849 2656 move_linked_works(work, scheduled, &n); 2657 + pwq->stats[PWQ_STAT_RESCUED]++; 2850 2658 } 2851 2659 first = false; 2852 2660 }
+13 -11
kernel/workqueue_internal.h
··· 28 28 struct hlist_node hentry; /* L: while busy */ 29 29 }; 30 30 31 - struct work_struct *current_work; /* L: work being processed */ 32 - work_func_t current_func; /* L: current_work's fn */ 33 - struct pool_workqueue *current_pwq; /* L: current_work's pwq */ 34 - unsigned int current_color; /* L: current_work's color */ 35 - struct list_head scheduled; /* L: scheduled works */ 31 + struct work_struct *current_work; /* K: work being processed and its */ 32 + work_func_t current_func; /* K: function */ 33 + struct pool_workqueue *current_pwq; /* K: pwq */ 34 + u64 current_at; /* K: runtime at start or last wakeup */ 35 + unsigned int current_color; /* K: color */ 36 36 37 - /* 64 bytes boundary on 64bit, 32 on 32bit */ 37 + int sleeping; /* S: is worker sleeping? */ 38 + 39 + /* used by the scheduler to determine a worker's last known identity */ 40 + work_func_t last_func; /* K: last work's fn */ 41 + 42 + struct list_head scheduled; /* L: scheduled works */ 38 43 39 44 struct task_struct *task; /* I: worker task */ 40 45 struct worker_pool *pool; /* A: the associated pool */ ··· 47 42 struct list_head node; /* A: anchored at pool->workers */ 48 43 /* A: runs through worker->node */ 49 44 50 - unsigned long last_active; /* L: last active timestamp */ 45 + unsigned long last_active; /* K: last active timestamp */ 51 46 unsigned int flags; /* X: flags */ 52 47 int id; /* I: worker id */ 53 - int sleeping; /* None */ 54 48 55 49 /* 56 50 * Opaque string set with work_set_desc(). 
Printed out with task ··· 59 55 60 56 /* used only by rescuers to point to the target workqueue */ 61 57 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ 62 - 63 - /* used by the scheduler to determine a worker's last known identity */ 64 - work_func_t last_func; 65 58 }; 66 59 67 60 /** ··· 77 76 */ 78 77 void wq_worker_running(struct task_struct *task); 79 78 void wq_worker_sleeping(struct task_struct *task); 79 + void wq_worker_tick(struct task_struct *task); 80 80 work_func_t wq_worker_last_func(struct task_struct *task); 81 81 82 82 #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
+13
lib/Kconfig.debug
··· 1134 1134 state. This can be configured through kernel parameter 1135 1135 "workqueue.watchdog_thresh" and its sysfs counterpart. 1136 1136 1137 + config WQ_CPU_INTENSIVE_REPORT 1138 + bool "Report per-cpu work items which hog CPU for too long" 1139 + depends on DEBUG_KERNEL 1140 + help 1141 + Say Y here to enable reporting of concurrency-managed per-cpu work 1142 + items that hog CPUs for longer than 1143 + workqueue.cpu_intensive_thresh_us. Workqueue automatically 1144 + detects and excludes them from concurrency management to prevent 1145 + them from stalling other per-cpu work items. Occasional 1146 + triggering may not necessarily indicate a problem. Repeated 1147 + triggering likely indicates that the work item should be switched 1148 + to use an unbound workqueue. 1149 + 1137 1150 config TEST_LOCKUP 1138 1151 tristate "Test module to generate lockups" 1139 1152 depends on m
+168
tools/workqueue/wq_monitor.py
··· 1 + #!/usr/bin/env drgn 2 + # 3 + # Copyright (C) 2023 Tejun Heo <tj@kernel.org> 4 + # Copyright (C) 2023 Meta Platforms, Inc. and affiliates. 5 + 6 + desc = """ 7 + This is a drgn script to monitor workqueues. For more info on drgn, visit 8 + https://github.com/osandov/drgn. 9 + 10 + total Total number of work items executed by the workqueue. 11 + 12 + infl The number of currently in-flight work items. 13 + 14 + CPUtime Total CPU time consumed by the workqueue in seconds. This is 15 + sampled from scheduler ticks and only provides ballpark 16 + measurement. "nohz_full=" CPUs are excluded from measurement. 17 + 18 + CPUitsv The number of times a concurrency-managed work item hogged CPU 19 + longer than the threshold (workqueue.cpu_intensive_thresh_us) 20 + and got excluded from concurrency management to avoid stalling 21 + other work items. 22 + 23 + CMwake The number of concurrency-management wake-ups while executing a 24 + work item of the workqueue. 25 + 26 + mayday The number of times the rescuer was requested while waiting for 27 + new worker creation. 28 + 29 + rescued The number of work items executed by the rescuer. 
30 + """ 31 + 32 + import sys 33 + import signal 34 + import os 35 + import re 36 + import time 37 + import json 38 + 39 + import drgn 40 + from drgn.helpers.linux.list import list_for_each_entry,list_empty 41 + from drgn.helpers.linux.cpumask import for_each_possible_cpu 42 + 43 + import argparse 44 + parser = argparse.ArgumentParser(description=desc, 45 + formatter_class=argparse.RawTextHelpFormatter) 46 + parser.add_argument('workqueue', metavar='REGEX', nargs='*', 47 + help='Target workqueue name patterns (all if empty)') 48 + parser.add_argument('-i', '--interval', metavar='SECS', type=float, default=1, 49 + help='Monitoring interval (0 to print once and exit)') 50 + parser.add_argument('-j', '--json', action='store_true', 51 + help='Output in json') 52 + args = parser.parse_args() 53 + 54 + def err(s): 55 + print(s, file=sys.stderr, flush=True) 56 + sys.exit(1) 57 + 58 + workqueues = prog['workqueues'] 59 + 60 + WQ_UNBOUND = prog['WQ_UNBOUND'] 61 + WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM'] 62 + 63 + PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution 64 + PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution 65 + PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed 66 + PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations 67 + PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups 68 + PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer 69 + PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer 70 + PWQ_NR_STATS = prog['PWQ_NR_STATS'] 71 + 72 + class WqStats: 73 + def __init__(self, wq): 74 + self.name = wq.name.string_().decode() 75 + self.unbound = wq.flags & WQ_UNBOUND != 0 76 + self.mem_reclaim = wq.flags & WQ_MEM_RECLAIM != 0 77 + self.stats = [0] * PWQ_NR_STATS 78 + for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'): 79 + for i in 
range(PWQ_NR_STATS): 80 + self.stats[i] += int(pwq.stats[i]) 81 + 82 + def dict(self, now): 83 + return { 'timestamp' : now, 84 + 'name' : self.name, 85 + 'unbound' : self.unbound, 86 + 'mem_reclaim' : self.mem_reclaim, 87 + 'started' : self.stats[PWQ_STAT_STARTED], 88 + 'completed' : self.stats[PWQ_STAT_COMPLETED], 89 + 'cpu_time' : self.stats[PWQ_STAT_CPU_TIME], 90 + 'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE], 91 + 'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP], 92 + 'mayday' : self.stats[PWQ_STAT_MAYDAY], 93 + 'rescued' : self.stats[PWQ_STAT_RESCUED], } 94 + 95 + def table_header_str(): 96 + return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\ 97 + f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}' 98 + 99 + def table_row_str(self): 100 + cpu_intensive = '-' 101 + cm_wakeup = '-' 102 + mayday = '-' 103 + rescued = '-' 104 + 105 + if not self.unbound: 106 + cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE]) 107 + cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP]) 108 + 109 + if self.mem_reclaim: 110 + mayday = str(self.stats[PWQ_STAT_MAYDAY]) 111 + rescued = str(self.stats[PWQ_STAT_RESCUED]) 112 + 113 + out = f'{self.name[-24:]:24} ' \ 114 + f'{self.stats[PWQ_STAT_STARTED]:8} ' \ 115 + f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \ 116 + f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \ 117 + f'{cpu_intensive:>7} ' \ 118 + f'{cm_wakeup:>7} ' \ 119 + f'{mayday:>7} ' \ 120 + f'{rescued:>7} ' 121 + return out.rstrip(':') 122 + 123 + exit_req = False 124 + 125 + def sigint_handler(signr, frame): 126 + global exit_req 127 + exit_req = True 128 + 129 + def main(): 130 + # handle args 131 + table_fmt = not args.json 132 + interval = args.interval 133 + 134 + re_str = None 135 + if args.workqueue: 136 + for r in args.workqueue: 137 + if re_str is None: 138 + re_str = r 139 + else: 140 + re_str += '|' + r 141 + 142 + filter_re = re.compile(re_str) if re_str else None 143 + 144 + # monitoring loop 145 + 
signal.signal(signal.SIGINT, sigint_handler) 146 + 147 + while not exit_req: 148 + now = time.time() 149 + 150 + if table_fmt: 151 + print() 152 + print(WqStats.table_header_str()) 153 + 154 + for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): 155 + stats = WqStats(wq) 156 + if filter_re and not filter_re.search(stats.name): 157 + continue 158 + if table_fmt: 159 + print(stats.table_row_str()) 160 + else: 161 + print(stats.dict(now)) 162 + 163 + if interval == 0: 164 + break 165 + time.sleep(interval) 166 + 167 + if __name__ == "__main__": 168 + main()