Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'bpf-stackmap-nmi'

Song Liu says:
====================
Changes v2 -> v3:
Improve syntax based on suggestion by Tobin C. Harding.

Changes v1 -> v2:
1. Rename some variables to (hopefully) reduce confusion;
2. Check irq_work status with IRQ_WORK_BUSY (instead of work->sem);
3. In Kconfig, let BPF_SYSCALL select IRQ_WORK;
4. Add static to DEFINE_PER_CPU();
5. Remove pr_info() in stack_map_init().
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>

+196 -8
+1
init/Kconfig
··· 1391 1391 bool "Enable bpf() system call" 1392 1392 select ANON_INODES 1393 1393 select BPF 1394 + select IRQ_WORK 1394 1395 default n 1395 1396 help 1396 1397 Enable the bpf() system call that allows to manipulate eBPF
+53 -6
kernel/bpf/stackmap.c
··· 11 11 #include <linux/perf_event.h> 12 12 #include <linux/elf.h> 13 13 #include <linux/pagemap.h> 14 + #include <linux/irq_work.h> 14 15 #include "percpu_freelist.h" 15 16 16 17 #define STACK_CREATE_FLAG_MASK \ ··· 32 31 u32 n_buckets; 33 32 struct stack_map_bucket *buckets[]; 34 33 }; 34 + 35 + /* irq_work to run up_read() for build_id lookup in nmi context */ 36 + struct stack_map_irq_work { 37 + struct irq_work irq_work; 38 + struct rw_semaphore *sem; 39 + }; 40 + 41 + static void do_up_read(struct irq_work *entry) 42 + { 43 + struct stack_map_irq_work *work; 44 + 45 + work = container_of(entry, struct stack_map_irq_work, irq_work); 46 + up_read(work->sem); 47 + work->sem = NULL; 48 + } 49 + 50 + static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); 35 51 36 52 static inline bool stack_map_use_build_id(struct bpf_map *map) 37 53 { ··· 285 267 { 286 268 int i; 287 269 struct vm_area_struct *vma; 270 + bool in_nmi_ctx = in_nmi(); 271 + bool irq_work_busy = false; 272 + struct stack_map_irq_work *work; 273 + 274 + if (in_nmi_ctx) { 275 + work = this_cpu_ptr(&up_read_work); 276 + if (work->irq_work.flags & IRQ_WORK_BUSY) 277 + /* cannot queue more up_read, fallback */ 278 + irq_work_busy = true; 279 + } 288 280 289 281 /* 290 - * We cannot do up_read() in nmi context, so build_id lookup is 291 - * only supported for non-nmi events. If at some point, it is 292 - * possible to run find_vma() without taking the semaphore, we 293 - * would like to allow build_id lookup in nmi context. 282 + * We cannot do up_read() in nmi context. To do build_id lookup 283 + * in nmi context, we need to run up_read() in irq_work. We use 284 + * a percpu variable to do the irq_work. If the irq_work is 285 + * already used by another lookup, we fall back to report ips. 294 286 * 295 287 * Same fallback is used for kernel stack (!user) on a stackmap 296 288 * with build_id. 
297 289 */ 298 - if (!user || !current || !current->mm || in_nmi() || 290 + if (!user || !current || !current->mm || irq_work_busy || 299 291 down_read_trylock(&current->mm->mmap_sem) == 0) { 300 292 /* cannot access current->mm, fall back to ips */ 301 293 for (i = 0; i < trace_nr; i++) { ··· 327 299 - vma->vm_start; 328 300 id_offs[i].status = BPF_STACK_BUILD_ID_VALID; 329 301 } 330 - up_read(&current->mm->mmap_sem); 302 + 303 + if (!in_nmi_ctx) { 304 + up_read(&current->mm->mmap_sem); 305 + } else { 306 + work->sem = &current->mm->mmap_sem; 307 + irq_work_queue(&work->irq_work); 308 + } 331 309 } 332 310 333 311 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, ··· 609 575 .map_update_elem = stack_map_update_elem, 610 576 .map_delete_elem = stack_map_delete_elem, 611 577 }; 578 + 579 + static int __init stack_map_init(void) 580 + { 581 + int cpu; 582 + struct stack_map_irq_work *work; 583 + 584 + for_each_possible_cpu(cpu) { 585 + work = per_cpu_ptr(&up_read_work, cpu); 586 + init_irq_work(&work->irq_work, do_up_read); 587 + } 588 + return 0; 589 + } 590 + subsys_initcall(stack_map_init);
+134
tools/testing/selftests/bpf/test_progs.c
··· 1272 1272 return; 1273 1273 } 1274 1274 1275 + static void test_stacktrace_build_id_nmi(void) 1276 + { 1277 + int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd; 1278 + const char *file = "./test_stacktrace_build_id.o"; 1279 + int err, pmu_fd, prog_fd; 1280 + struct perf_event_attr attr = { 1281 + .sample_freq = 5000, 1282 + .freq = 1, 1283 + .type = PERF_TYPE_HARDWARE, 1284 + .config = PERF_COUNT_HW_CPU_CYCLES, 1285 + }; 1286 + __u32 key, previous_key, val, duration = 0; 1287 + struct bpf_object *obj; 1288 + char buf[256]; 1289 + int i, j; 1290 + struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH]; 1291 + int build_id_matches = 0; 1292 + 1293 + err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd); 1294 + if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) 1295 + return; 1296 + 1297 + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 1298 + 0 /* cpu 0 */, -1 /* group id */, 1299 + 0 /* flags */); 1300 + if (CHECK(pmu_fd < 0, "perf_event_open", 1301 + "err %d errno %d. 
Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n", 1302 + pmu_fd, errno)) 1303 + goto close_prog; 1304 + 1305 + err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); 1306 + if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", 1307 + err, errno)) 1308 + goto close_pmu; 1309 + 1310 + err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 1311 + if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", 1312 + err, errno)) 1313 + goto disable_pmu; 1314 + 1315 + /* find map fds */ 1316 + control_map_fd = bpf_find_map(__func__, obj, "control_map"); 1317 + if (CHECK(control_map_fd < 0, "bpf_find_map control_map", 1318 + "err %d errno %d\n", err, errno)) 1319 + goto disable_pmu; 1320 + 1321 + stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); 1322 + if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", 1323 + "err %d errno %d\n", err, errno)) 1324 + goto disable_pmu; 1325 + 1326 + stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); 1327 + if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", 1328 + err, errno)) 1329 + goto disable_pmu; 1330 + 1331 + stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap"); 1332 + if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap", 1333 + "err %d errno %d\n", err, errno)) 1334 + goto disable_pmu; 1335 + 1336 + assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null") 1337 + == 0); 1338 + assert(system("taskset 0x1 ./urandom_read 100000") == 0); 1339 + /* disable stack trace collection */ 1340 + key = 0; 1341 + val = 1; 1342 + bpf_map_update_elem(control_map_fd, &key, &val, 0); 1343 + 1344 + /* for every element in stackid_hmap, we can find a corresponding one 1345 + * in stackmap, and vice versa. 1346 + */ 1347 + err = compare_map_keys(stackid_hmap_fd, stackmap_fd); 1348 + if (CHECK(err, "compare_map_keys stackid_hmap vs. 
stackmap", 1349 + "err %d errno %d\n", err, errno)) 1350 + goto disable_pmu; 1351 + 1352 + err = compare_map_keys(stackmap_fd, stackid_hmap_fd); 1353 + if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", 1354 + "err %d errno %d\n", err, errno)) 1355 + goto disable_pmu; 1356 + 1357 + err = extract_build_id(buf, 256); 1358 + 1359 + if (CHECK(err, "get build_id with readelf", 1360 + "err %d errno %d\n", err, errno)) 1361 + goto disable_pmu; 1362 + 1363 + err = bpf_map_get_next_key(stackmap_fd, NULL, &key); 1364 + if (CHECK(err, "get_next_key from stackmap", 1365 + "err %d, errno %d\n", err, errno)) 1366 + goto disable_pmu; 1367 + 1368 + do { 1369 + char build_id[64]; 1370 + 1371 + err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs); 1372 + if (CHECK(err, "lookup_elem from stackmap", 1373 + "err %d, errno %d\n", err, errno)) 1374 + goto disable_pmu; 1375 + for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i) 1376 + if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID && 1377 + id_offs[i].offset != 0) { 1378 + for (j = 0; j < 20; ++j) 1379 + sprintf(build_id + 2 * j, "%02x", 1380 + id_offs[i].build_id[j] & 0xff); 1381 + if (strstr(buf, build_id) != NULL) 1382 + build_id_matches = 1; 1383 + } 1384 + previous_key = key; 1385 + } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0); 1386 + 1387 + if (CHECK(build_id_matches < 1, "build id match", 1388 + "Didn't find expected build ID from the map\n")) 1389 + goto disable_pmu; 1390 + 1391 + /* 1392 + * We intentionally skip compare_stack_ips(). 
This is because we 1393 + * only support one in_nmi() ips-to-build_id translation per cpu 1394 + * at any time, thus stack_amap here will always fallback to 1395 + * BPF_STACK_BUILD_ID_IP; 1396 + */ 1397 + 1398 + disable_pmu: 1399 + ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); 1400 + 1401 + close_pmu: 1402 + close(pmu_fd); 1403 + 1404 + close_prog: 1405 + bpf_object__close(obj); 1406 + } 1407 + 1275 1408 #define MAX_CNT_RAWTP 10ull 1276 1409 #define MAX_STACK_RAWTP 100 1277 1410 struct get_stack_trace_t { ··· 1558 1425 test_tp_attach_query(); 1559 1426 test_stacktrace_map(); 1560 1427 test_stacktrace_build_id(); 1428 + test_stacktrace_build_id_nmi(); 1561 1429 test_stacktrace_map_raw_tp(); 1562 1430 test_get_stack_raw_tp(); 1563 1431
+8 -2
tools/testing/selftests/bpf/urandom_read.c
··· 6 6 #include <stdlib.h> 7 7 8 8 #define BUF_SIZE 256 9 - int main(void) 9 + 10 + int main(int argc, char *argv[]) 10 11 { 11 12 int fd = open("/dev/urandom", O_RDONLY); 12 13 int i; 13 14 char buf[BUF_SIZE]; 15 + int count = 4; 14 16 15 17 if (fd < 0) 16 18 return 1; 17 - for (i = 0; i < 4; ++i) 19 + 20 + if (argc == 2) 21 + count = atoi(argv[1]); 22 + 23 + for (i = 0; i < count; ++i) 18 24 read(fd, buf, BUF_SIZE); 19 25 20 26 close(fd);