Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

delayacct: track delays from IRQ/SOFTIRQ

Delay accounting does not track the delay of IRQ/SOFTIRQ, even though
IRQ/SOFTIRQ can have an obvious impact on the productivity of some
workloads, such as when they run on a system that is busy handling
network IRQ/SOFTIRQ.

Getting the delay of IRQ/SOFTIRQ could help users reduce such delay, for
example by setting interrupt affinity or task affinity, or by using kernel
threads for NAPI. This is inspired by "sched/psi: Add PSI_IRQ to track
IRQ/SOFTIRQ pressure"[1]. Also fix some indentation problems in older
code.

Also update tools/accounting/getdelays.c. Example output:
/ # ./getdelays -p 156 -di
print delayacct stats ON
printing IO accounting
PID 156

CPU count real total virtual total delay total delay average
15 15836008 16218149 275700790 18.380ms
IO count delay total delay average
0 0 0.000ms
SWAP count delay total delay average
0 0 0.000ms
RECLAIM count delay total delay average
0 0 0.000ms
THRASHING count delay total delay average
0 0 0.000ms
COMPACT count delay total delay average
0 0 0.000ms
WPCOPY count delay total delay average
36 7586118 0.211ms
IRQ count delay total delay average
42 929161 0.022ms

[1] commit 52b1364ba0b1 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure")

Link: https://lkml.kernel.org/r/202304081728353557233@zte.com.cn
Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
Cc: Jiang Xuexin <jiang.xuexin@zte.com.cn>
Cc: wangyong <wang.yong12@zte.com.cn>
Cc: junhua huang <huang.junhua@zte.com.cn>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Yang Yang and committed by
Andrew Morton
a3b2aeac 29692fc9

+58 -15
+5 -2
Documentation/accounting/delay-accounting.rst
··· 16 16 e) thrashing 17 17 f) direct compact 18 18 g) write-protect copy 19 + h) IRQ/SOFTIRQ 19 20 20 21 and makes these statistics available to userspace through 21 22 the taskstats interface. ··· 50 49 for a description of the fields pertaining to delay accounting. 51 50 It will generally be in the form of counters returning the cumulative 52 51 delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page 53 - cache, direct compact, write-protect copy etc. 52 + cache, direct compact, write-protect copy, IRQ/SOFTIRQ etc. 54 53 55 54 Taking the difference of two successive readings of a given 56 55 counter (say cpu_delay_total) for a task will give the delay ··· 119 118 0 0 0.000ms 120 119 COMPACT count delay total delay average 121 120 0 0 0.000ms 122 - WPCOPY count delay total delay average 121 + WPCOPY count delay total delay average 122 + 0 0 0.000ms 123 + IRQ count delay total delay average 123 124 0 0 0.000ms 124 125 125 126 Get IO accounting for pid 1, it works only with -p::
+15
include/linux/delayacct.h
··· 48 48 u64 wpcopy_start; 49 49 u64 wpcopy_delay; /* wait for write-protect copy */ 50 50 51 + u64 irq_delay; /* wait for IRQ/SOFTIRQ */ 52 + 51 53 u32 freepages_count; /* total count of memory reclaim */ 52 54 u32 thrashing_count; /* total count of thrash waits */ 53 55 u32 compact_count; /* total count of memory compact */ 54 56 u32 wpcopy_count; /* total count of write-protect copy */ 57 + u32 irq_count; /* total count of IRQ/SOFTIRQ */ 55 58 }; 56 59 #endif 57 60 ··· 84 81 extern void __delayacct_compact_end(void); 85 82 extern void __delayacct_wpcopy_start(void); 86 83 extern void __delayacct_wpcopy_end(void); 84 + extern void __delayacct_irq(struct task_struct *task, u32 delta); 87 85 88 86 static inline void delayacct_tsk_init(struct task_struct *tsk) 89 87 { ··· 219 215 __delayacct_wpcopy_end(); 220 216 } 221 217 218 + static inline void delayacct_irq(struct task_struct *task, u32 delta) 219 + { 220 + if (!static_branch_unlikely(&delayacct_key)) 221 + return; 222 + 223 + if (task->delays) 224 + __delayacct_irq(task, delta); 225 + } 226 + 222 227 #else 223 228 static inline void delayacct_init(void) 224 229 {} ··· 265 252 static inline void delayacct_wpcopy_start(void) 266 253 {} 267 254 static inline void delayacct_wpcopy_end(void) 255 + {} 256 + static inline void delayacct_irq(struct task_struct *task, u32 delta) 268 257 {} 269 258 270 259 #endif /* CONFIG_TASK_DELAY_ACCT */
+5 -1
include/uapi/linux/taskstats.h
··· 34 34 */ 35 35 36 36 37 - #define TASKSTATS_VERSION 13 37 + #define TASKSTATS_VERSION 14 38 38 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN 39 39 * in linux/sched.h */ 40 40 ··· 198 198 /* v13: Delay waiting for write-protect copy */ 199 199 __u64 wpcopy_count; 200 200 __u64 wpcopy_delay_total; 201 + 202 + /* v14: Delay waiting for IRQ/SOFTIRQ */ 203 + __u64 irq_count; 204 + __u64 irq_delay_total; 201 205 }; 202 206 203 207
+14
kernel/delayacct.c
··· 179 179 d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; 180 180 tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay; 181 181 d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp; 182 + tmp = d->irq_delay_total + tsk->delays->irq_delay; 183 + d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp; 182 184 d->blkio_count += tsk->delays->blkio_count; 183 185 d->swapin_count += tsk->delays->swapin_count; 184 186 d->freepages_count += tsk->delays->freepages_count; 185 187 d->thrashing_count += tsk->delays->thrashing_count; 186 188 d->compact_count += tsk->delays->compact_count; 187 189 d->wpcopy_count += tsk->delays->wpcopy_count; 190 + d->irq_count += tsk->delays->irq_count; 188 191 raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); 189 192 190 193 return 0; ··· 277 274 &current->delays->wpcopy_delay, 278 275 &current->delays->wpcopy_count); 279 276 } 277 + 278 + void __delayacct_irq(struct task_struct *task, u32 delta) 279 + { 280 + unsigned long flags; 281 + 282 + raw_spin_lock_irqsave(&task->delays->lock, flags); 283 + task->delays->irq_delay += delta; 284 + task->delays->irq_count++; 285 + raw_spin_unlock_irqrestore(&task->delays->lock, flags); 286 + } 287 +
+1
kernel/sched/core.c
··· 704 704 rq->prev_irq_time += irq_delta; 705 705 delta -= irq_delta; 706 706 psi_account_irqtime(rq->curr, irq_delta); 707 + delayacct_irq(rq->curr, irq_delta); 707 708 #endif 708 709 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 709 710 if (static_key_false((&paravirt_steal_rq_enabled))) {
+18 -12
tools/accounting/getdelays.c
··· 198 198 printf("\n\nCPU %15s%15s%15s%15s%15s\n" 199 199 " %15llu%15llu%15llu%15llu%15.3fms\n" 200 200 "IO %15s%15s%15s\n" 201 - " %15llu%15llu%15.3fms\n" 201 + " %15llu%15llu%15.3fms\n" 202 202 "SWAP %15s%15s%15s\n" 203 - " %15llu%15llu%15.3fms\n" 203 + " %15llu%15llu%15.3fms\n" 204 204 "RECLAIM %12s%15s%15s\n" 205 - " %15llu%15llu%15.3fms\n" 205 + " %15llu%15llu%15.3fms\n" 206 206 "THRASHING%12s%15s%15s\n" 207 - " %15llu%15llu%15.3fms\n" 207 + " %15llu%15llu%15.3fms\n" 208 208 "COMPACT %12s%15s%15s\n" 209 - " %15llu%15llu%15.3fms\n" 209 + " %15llu%15llu%15.3fms\n" 210 210 "WPCOPY %12s%15s%15s\n" 211 - " %15llu%15llu%15.3fms\n", 211 + " %15llu%15llu%15.3fms\n" 212 + "IRQ %15s%15s%15s\n" 213 + " %15llu%15llu%15.3fms\n", 212 214 "count", "real total", "virtual total", 213 215 "delay total", "delay average", 214 216 (unsigned long long)t->cpu_count, ··· 221 219 "count", "delay total", "delay average", 222 220 (unsigned long long)t->blkio_count, 223 221 (unsigned long long)t->blkio_delay_total, 224 - average_ms((double)t->blkio_delay_total, t->blkio_count), 222 + average_ms((double)t->blkio_delay_total, t->blkio_count), 225 223 "count", "delay total", "delay average", 226 224 (unsigned long long)t->swapin_count, 227 225 (unsigned long long)t->swapin_delay_total, 228 - average_ms((double)t->swapin_delay_total, t->swapin_count), 226 + average_ms((double)t->swapin_delay_total, t->swapin_count), 229 227 "count", "delay total", "delay average", 230 228 (unsigned long long)t->freepages_count, 231 229 (unsigned long long)t->freepages_delay_total, 232 - average_ms((double)t->freepages_delay_total, t->freepages_count), 230 + average_ms((double)t->freepages_delay_total, t->freepages_count), 233 231 "count", "delay total", "delay average", 234 232 (unsigned long long)t->thrashing_count, 235 233 (unsigned long long)t->thrashing_delay_total, 236 - average_ms((double)t->thrashing_delay_total, t->thrashing_count), 234 + average_ms((double)t->thrashing_delay_total, 
t->thrashing_count), 237 235 "count", "delay total", "delay average", 238 236 (unsigned long long)t->compact_count, 239 237 (unsigned long long)t->compact_delay_total, 240 - average_ms((double)t->compact_delay_total, t->compact_count), 238 + average_ms((double)t->compact_delay_total, t->compact_count), 241 239 "count", "delay total", "delay average", 242 240 (unsigned long long)t->wpcopy_count, 243 241 (unsigned long long)t->wpcopy_delay_total, 244 - average_ms((double)t->wpcopy_delay_total, t->wpcopy_count)); 242 + average_ms((double)t->wpcopy_delay_total, t->wpcopy_count), 243 + "count", "delay total", "delay average", 244 + (unsigned long long)t->irq_count, 245 + (unsigned long long)t->irq_delay_total, 246 + average_ms((double)t->irq_delay_total, t->irq_count)); 245 247 } 246 248 247 249 static void task_context_switch_counts(struct taskstats *t)