Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

trace/hwlat: Implement the per-cpu mode

Implements the per-cpu mode in which a sampling thread is created for
each cpu in the "cpus" (and tracing_cpumask).

The per-cpu mode has the potential to speed up the hwlat detection by
running on multiple CPUs at the same time, at the cost of higher cpu
usage with irqs disabled. Use with care.

[
Changed get_cpu_data() to static.
Reported-by: kernel test robot <lkp@intel.com>
]

Link: https://lkml.kernel.org/r/ec06d0ab340e8460d293772faba19ad8a5c371aa.1624372313.git.bristot@redhat.com

Cc: Phil Auld <pauld@redhat.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Kate Carcia <kcarcia@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Alexandre Chartre <alexandre.chartre@oracle.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

authored by

Daniel Bristot de Oliveira and committed by
Steven Rostedt (VMware)
f46b1652 7bb7d802

+152 -37
+2 -1
Documentation/trace/hwlat_detector.rst
··· 78 78 - hwlat_detector/window - amount of time between (width) runs (usecs) 79 79 - hwlat_detector/mode - the thread mode 80 80 81 - By default, the hwlat detector's kernel thread will migrate across each CPU 81 + By default, one hwlat detector's kernel thread will migrate across each CPU 82 82 specified in cpumask at the beginning of a new window, in a round-robin 83 83 fashion. This behavior can be changed by changing the thread mode, 84 84 the available options are: 85 85 86 86 - none: do not force migration 87 87 - round-robin: migrate across each CPU specified in cpumask [default] 88 + - per-cpu: create one thread for each cpu in tracing_cpumask
+150 -36
kernel/trace/trace_hwlat.c
··· 54 54 #define DEFAULT_SAMPLE_WIDTH 500000 /* 0.5s */ 55 55 #define DEFAULT_LAT_THRESHOLD 10 /* 10us */ 56 56 57 - /* sampling thread*/ 58 - static struct task_struct *hwlat_kthread; 59 - 60 57 static struct dentry *hwlat_sample_width; /* sample width us */ 61 58 static struct dentry *hwlat_sample_window; /* sample window us */ 62 59 static struct dentry *hwlat_thread_mode; /* hwlat thread mode */ ··· 61 64 enum { 62 65 MODE_NONE = 0, 63 66 MODE_ROUND_ROBIN, 67 + MODE_PER_CPU, 64 68 MODE_MAX 65 69 }; 66 - static char *thread_mode_str[] = { "none", "round-robin" }; 70 + static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" }; 67 71 68 72 /* Save the previous tracing_thresh value */ 69 73 static unsigned long save_tracing_thresh; 70 74 71 - /* NMI timestamp counters */ 72 - static u64 nmi_ts_start; 73 - static u64 nmi_total_ts; 74 - static int nmi_count; 75 - static int nmi_cpu; 75 + /* runtime kthread data */ 76 + struct hwlat_kthread_data { 77 + struct task_struct *kthread; 78 + /* NMI timestamp counters */ 79 + u64 nmi_ts_start; 80 + u64 nmi_total_ts; 81 + int nmi_count; 82 + int nmi_cpu; 83 + }; 84 + 85 + struct hwlat_kthread_data hwlat_single_cpu_data; 86 + DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); 76 87 77 88 /* Tells NMIs to call back to the hwlat tracer to record timestamps */ 78 89 bool trace_hwlat_callback_enabled; ··· 116 111 .sample_width = DEFAULT_SAMPLE_WIDTH, 117 112 .thread_mode = MODE_ROUND_ROBIN 118 113 }; 114 + 115 + static struct hwlat_kthread_data *get_cpu_data(void) 116 + { 117 + if (hwlat_data.thread_mode == MODE_PER_CPU) 118 + return this_cpu_ptr(&hwlat_per_cpu_data); 119 + else 120 + return &hwlat_single_cpu_data; 121 + } 119 122 120 123 static bool hwlat_busy; 121 124 ··· 162 149 163 150 void trace_hwlat_callback(bool enter) 164 151 { 165 - if (smp_processor_id() != nmi_cpu) 152 + struct hwlat_kthread_data *kdata = get_cpu_data(); 153 + 154 + if (!kdata->kthread) 166 155 return; 167 156 168 157 /* ··· 
173 158 */ 174 159 if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) { 175 160 if (enter) 176 - nmi_ts_start = time_get(); 161 + kdata->nmi_ts_start = time_get(); 177 162 else 178 - nmi_total_ts += time_get() - nmi_ts_start; 163 + kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start; 179 164 } 180 165 181 166 if (enter) 182 - nmi_count++; 167 + kdata->nmi_count++; 183 168 } 184 169 185 170 /** ··· 191 176 */ 192 177 static int get_sample(void) 193 178 { 179 + struct hwlat_kthread_data *kdata = get_cpu_data(); 194 180 struct trace_array *tr = hwlat_trace; 195 181 struct hwlat_sample s; 196 182 time_type start, t1, t2, last_t2; ··· 204 188 205 189 do_div(thresh, NSEC_PER_USEC); /* modifies interval value */ 206 190 207 - nmi_cpu = smp_processor_id(); 208 - nmi_total_ts = 0; 209 - nmi_count = 0; 191 + kdata->nmi_total_ts = 0; 192 + kdata->nmi_count = 0; 210 193 /* Make sure NMIs see this first */ 211 194 barrier(); 212 195 ··· 275 260 ret = 1; 276 261 277 262 /* We read in microseconds */ 278 - if (nmi_total_ts) 279 - do_div(nmi_total_ts, NSEC_PER_USEC); 263 + if (kdata->nmi_total_ts) 264 + do_div(kdata->nmi_total_ts, NSEC_PER_USEC); 280 265 281 266 hwlat_data.count++; 282 267 s.seqnum = hwlat_data.count; 283 268 s.duration = sample; 284 269 s.outer_duration = outer_sample; 285 - s.nmi_total_ts = nmi_total_ts; 286 - s.nmi_count = nmi_count; 270 + s.nmi_total_ts = kdata->nmi_total_ts; 271 + s.nmi_count = kdata->nmi_count; 287 272 s.count = count; 288 273 trace_hwlat_sample(&s); 289 274 ··· 379 364 } 380 365 381 366 /* 382 - * start_kthread - Kick off the hardware latency sampling/detector kthread 367 + * stop_stop_kthread - Inform the hardware latency sampling/detector kthread to stop 368 + * 369 + * This kicks the running hardware latency sampling/detector kernel thread and 370 + * tells it to stop sampling now. Use this on unload and at system shutdown. 
371 + */ 372 + static void stop_single_kthread(void) 373 + { 374 + struct hwlat_kthread_data *kdata = get_cpu_data(); 375 + struct task_struct *kthread = kdata->kthread; 376 + 377 + if (!kthread) 378 + return; 379 + 380 + kthread_stop(kthread); 381 + kdata->kthread = NULL; 382 + } 383 + 384 + 385 + /* 386 + * start_single_kthread - Kick off the hardware latency sampling/detector kthread 383 387 * 384 388 * This starts the kernel thread that will sit and sample the CPU timestamp 385 389 * counter (TSC or similar) and look for potential hardware latencies. 386 390 */ 387 - static int start_kthread(struct trace_array *tr) 391 + static int start_single_kthread(struct trace_array *tr) 388 392 { 393 + struct hwlat_kthread_data *kdata = get_cpu_data(); 389 394 struct cpumask *current_mask = &save_cpumask; 390 395 struct task_struct *kthread; 391 396 int next_cpu; 392 397 393 - if (hwlat_kthread) 398 + if (kdata->kthread) 394 399 return 0; 395 - 396 400 397 401 kthread = kthread_create(kthread_fn, NULL, "hwlatd"); 398 402 if (IS_ERR(kthread)) { ··· 434 400 435 401 sched_setaffinity(kthread->pid, current_mask); 436 402 437 - hwlat_kthread = kthread; 403 + kdata->kthread = kthread; 438 404 wake_up_process(kthread); 439 405 440 406 return 0; 441 407 } 442 408 443 409 /* 444 - * stop_kthread - Inform the hardware latency sampling/detector kthread to stop 410 + * stop_cpu_kthread - Stop a hwlat cpu kthread 411 + */ 412 + static void stop_cpu_kthread(unsigned int cpu) 413 + { 414 + struct task_struct *kthread; 415 + 416 + kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread; 417 + if (kthread) 418 + kthread_stop(kthread); 419 + } 420 + 421 + /* 422 + * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthread to stop 445 423 * 446 - * This kicks the running hardware latency sampling/detector kernel thread and 424 + * This kicks the running hardware latency sampling/detector kernel threads and 447 425 * tells it to stop sampling now. 
Use this on unload and at system shutdown. 448 426 */ 449 - static void stop_kthread(void) 427 + static void stop_per_cpu_kthreads(void) 450 428 { 451 - if (!hwlat_kthread) 452 - return; 453 - kthread_stop(hwlat_kthread); 454 - hwlat_kthread = NULL; 429 + unsigned int cpu; 430 + 431 + get_online_cpus(); 432 + for_each_online_cpu(cpu) 433 + stop_cpu_kthread(cpu); 434 + put_online_cpus(); 435 + } 436 + 437 + /* 438 + * start_cpu_kthread - Start a hwlat cpu kthread 439 + */ 440 + static int start_cpu_kthread(unsigned int cpu) 441 + { 442 + struct task_struct *kthread; 443 + char comm[24]; 444 + 445 + snprintf(comm, 24, "hwlatd/%d", cpu); 446 + 447 + kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm); 448 + if (IS_ERR(kthread)) { 449 + pr_err(BANNER "could not start sampling thread\n"); 450 + return -ENOMEM; 451 + } 452 + 453 + per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; 454 + wake_up_process(kthread); 455 + 456 + return 0; 457 + } 458 + 459 + /* 460 + * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads 461 + * 462 + * This starts the kernel threads that will sit on potentially all cpus and 463 + * sample the CPU timestamp counter (TSC or similar) and look for potential 464 + * hardware latencies. 465 + */ 466 + static int start_per_cpu_kthreads(struct trace_array *tr) 467 + { 468 + struct cpumask *current_mask = &save_cpumask; 469 + unsigned int cpu; 470 + int retval; 471 + 472 + get_online_cpus(); 473 + /* 474 + * Run only on CPUs in which hwlat is allowed to run. 
475 + */ 476 + cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); 477 + 478 + for_each_online_cpu(cpu) 479 + per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL; 480 + 481 + for_each_cpu(cpu, current_mask) { 482 + retval = start_cpu_kthread(cpu); 483 + if (retval) 484 + goto out_error; 485 + } 486 + put_online_cpus(); 487 + 488 + return 0; 489 + 490 + out_error: 491 + put_online_cpus(); 492 + stop_per_cpu_kthreads(); 493 + return retval; 455 494 } 456 495 457 496 /* ··· 707 600 * The "none" sets the allowed cpumask for a single hwlatd thread at the 708 601 * startup and lets the scheduler handle the migration. The default mode is 709 602 * the "round-robin" one, in which a single hwlatd thread runs, migrating 710 - * among the allowed CPUs in a round-robin fashion. 603 + * among the allowed CPUs in a round-robin fashion. The "per-cpu" mode 604 + * creates one hwlatd thread per allowed CPU. 711 605 */ 712 606 static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf, 713 607 size_t cnt, loff_t *ppos) ··· 832 724 { 833 725 int err; 834 726 835 - err = start_kthread(tr); 727 + if (hwlat_data.thread_mode == MODE_PER_CPU) 728 + err = start_per_cpu_kthreads(tr); 729 + else 730 + err = start_single_kthread(tr); 836 731 if (err) 837 732 pr_err(BANNER "Cannot start hwlat kthread\n"); 838 733 } 839 734 840 735 static void hwlat_tracer_stop(struct trace_array *tr) 841 736 { 842 - stop_kthread(); 737 + if (hwlat_data.thread_mode == MODE_PER_CPU) 738 + stop_per_cpu_kthreads(); 739 + else 740 + stop_single_kthread(); 843 741 } 844 742 845 743 static int hwlat_tracer_init(struct trace_array *tr) ··· 874 760 875 761 static void hwlat_tracer_reset(struct trace_array *tr) 876 762 { 877 - stop_kthread(); 763 + hwlat_tracer_stop(tr); 878 764 879 765 /* the tracing threshold is static between runs */ 880 766 last_tracing_thresh = tracing_thresh;