Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

perf lock contention: Account contending locks too

Currently it accounts the contention using delta between timestamps in
lock:contention_begin and lock:contention_end tracepoints. But it means
the lock should see the both events during the monitoring period.

Actually there are 4 cases that happen with the monitoring:

            monitoring period
          /                    \
          |                    |
 1:  B----+--------------------+----E
 2:  B----+-----------E        |
          |     B--------------+----E
 3:       |     B----------E   |
          |                    |
          t0                   t1

where B and E mean contention BEGIN and END, respectively. So it only
accounts the case 4 for now. It seems there's no way to handle the case
1. The case 2 might be handled if it saved the timestamp (t0), but it
lacks the information from the B event, notably the flags which show the
lock types. Also it could be a nested lock which it currently ignores.
So I think we should ignore the case 2.

However we can handle the case 3 if we save the timestamp (t1) at the
end of the period. And then it can iterate the map entries in the
userspace and update the lock stat accordingly.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Ian Rogers <irogers@google.com>
Reviewed-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Song Liu <song@kernel.org>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20240228053335.312776-1-namhyung@kernel.org

+136 -7
+120
tools/perf/util/bpf_lock_contention.c
··· 179 179 return 0; 180 180 } 181 181 182 + /* 183 + * Run the BPF program directly using BPF_PROG_TEST_RUN to update the end 184 + * timestamp in ktime so that it can calculate delta easily. 185 + */ 186 + static void mark_end_timestamp(void) 187 + { 188 + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, 189 + .flags = BPF_F_TEST_RUN_ON_CPU, 190 + ); 191 + int prog_fd = bpf_program__fd(skel->progs.end_timestamp); 192 + 193 + bpf_prog_test_run_opts(prog_fd, &opts); 194 + } 195 + 196 + static void update_lock_stat(int map_fd, int pid, u64 end_ts, 197 + enum lock_aggr_mode aggr_mode, 198 + struct tstamp_data *ts_data) 199 + { 200 + u64 delta; 201 + struct contention_key stat_key = {}; 202 + struct contention_data stat_data; 203 + 204 + if (ts_data->timestamp >= end_ts) 205 + return; 206 + 207 + delta = end_ts - ts_data->timestamp; 208 + 209 + switch (aggr_mode) { 210 + case LOCK_AGGR_CALLER: 211 + stat_key.stack_id = ts_data->stack_id; 212 + break; 213 + case LOCK_AGGR_TASK: 214 + stat_key.pid = pid; 215 + break; 216 + case LOCK_AGGR_ADDR: 217 + stat_key.lock_addr_or_cgroup = ts_data->lock; 218 + break; 219 + case LOCK_AGGR_CGROUP: 220 + /* TODO */ 221 + return; 222 + default: 223 + return; 224 + } 225 + 226 + if (bpf_map_lookup_elem(map_fd, &stat_key, &stat_data) < 0) 227 + return; 228 + 229 + stat_data.total_time += delta; 230 + stat_data.count++; 231 + 232 + if (delta > stat_data.max_time) 233 + stat_data.max_time = delta; 234 + if (delta < stat_data.min_time) 235 + stat_data.min_time = delta; 236 + 237 + bpf_map_update_elem(map_fd, &stat_key, &stat_data, BPF_EXIST); 238 + } 239 + 240 + /* 241 + * Account entries in the tstamp map (which didn't see the corresponding 242 + * lock:contention_end tracepoint) using end_ts. 
243 + */ 244 + static void account_end_timestamp(struct lock_contention *con) 245 + { 246 + int ts_fd, stat_fd; 247 + int *prev_key, key; 248 + u64 end_ts = skel->bss->end_ts; 249 + int total_cpus; 250 + enum lock_aggr_mode aggr_mode = con->aggr_mode; 251 + struct tstamp_data ts_data, *cpu_data; 252 + 253 + /* Iterate per-task tstamp map (key = TID) */ 254 + ts_fd = bpf_map__fd(skel->maps.tstamp); 255 + stat_fd = bpf_map__fd(skel->maps.lock_stat); 256 + 257 + prev_key = NULL; 258 + while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) { 259 + if (bpf_map_lookup_elem(ts_fd, &key, &ts_data) == 0) { 260 + int pid = key; 261 + 262 + if (aggr_mode == LOCK_AGGR_TASK && con->owner) 263 + pid = ts_data.flags; 264 + 265 + update_lock_stat(stat_fd, pid, end_ts, aggr_mode, 266 + &ts_data); 267 + } 268 + 269 + prev_key = &key; 270 + } 271 + 272 + /* Now it'll check per-cpu tstamp map which doesn't have TID. */ 273 + if (aggr_mode == LOCK_AGGR_TASK || aggr_mode == LOCK_AGGR_CGROUP) 274 + return; 275 + 276 + total_cpus = cpu__max_cpu().cpu; 277 + ts_fd = bpf_map__fd(skel->maps.tstamp_cpu); 278 + 279 + cpu_data = calloc(total_cpus, sizeof(*cpu_data)); 280 + if (cpu_data == NULL) 281 + return; 282 + 283 + prev_key = NULL; 284 + while (!bpf_map_get_next_key(ts_fd, prev_key, &key)) { 285 + if (bpf_map_lookup_elem(ts_fd, &key, cpu_data) < 0) 286 + goto next; 287 + 288 + for (int i = 0; i < total_cpus; i++) { 289 + update_lock_stat(stat_fd, -1, end_ts, aggr_mode, 290 + &cpu_data[i]); 291 + } 292 + 293 + next: 294 + prev_key = &key; 295 + } 296 + free(cpu_data); 297 + } 298 + 182 299 int lock_contention_start(void) 183 300 { 184 301 skel->bss->enabled = 1; ··· 305 188 int lock_contention_stop(void) 306 189 { 307 190 skel->bss->enabled = 0; 191 + mark_end_timestamp(); 308 192 return 0; 309 193 } 310 194 ··· 418 300 stack_trace = zalloc(stack_size); 419 301 if (stack_trace == NULL) 420 302 return -1; 303 + 304 + account_end_timestamp(con); 421 305 422 306 if (con->aggr_mode == 
LOCK_AGGR_TASK) { 423 307 struct thread *idle = __machine__findnew_thread(machine,
+9 -7
tools/perf/util/bpf_skel/lock_contention.bpf.c
··· 19 19 #define LCB_F_PERCPU (1U << 4) 20 20 #define LCB_F_MUTEX (1U << 5) 21 21 22 - struct tstamp_data { 23 - __u64 timestamp; 24 - __u64 lock; 25 - __u32 flags; 26 - __s32 stack_id; 27 - }; 28 - 29 22 /* callstack storage */ 30 23 struct { 31 24 __uint(type, BPF_MAP_TYPE_STACK_TRACE); ··· 132 139 133 140 /* determine the key of lock stat */ 134 141 int aggr_mode; 142 + 143 + __u64 end_ts; 135 144 136 145 /* error stat */ 137 146 int task_fail; ··· 551 556 lock_flag = LOCK_CLASS_RQLOCK; 552 557 bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); 553 558 } 559 + return 0; 560 + } 561 + 562 + SEC("raw_tp/bpf_test_finish") 563 + int BPF_PROG(end_timestamp) 564 + { 565 + end_ts = bpf_ktime_get_ns(); 554 566 return 0; 555 567 } 556 568
+7
tools/perf/util/bpf_skel/lock_data.h
··· 3 3 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H 4 4 #define UTIL_BPF_SKEL_LOCK_DATA_H 5 5 6 + struct tstamp_data { 7 + u64 timestamp; 8 + u64 lock; 9 + u32 flags; 10 + u32 stack_id; 11 + }; 12 + 6 13 struct contention_key { 7 14 u32 stack_id; 8 15 u32 pid;