Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

nds32: Add perf call-graph support.

The perf call-graph option can trace the callchain
between functions. This commit adds perf callchain
support for nds32. There are kernel callchains and user
callchains. The kernel callchain can trace functions in
kernel space. There are two types of user callchain: one
for when the 'optimize for size' config is set, and
another for when it is not. The difference between the two
types is the index of the frame-pointer in the user stack.

For example:
With optimize for size:
User Stack:
---------
| lp |
---------
| gp |
---------
| fp |

Without optimize for size:
User Stack:
1. non-leaf function:
---------
| lp |
---------
| fp |

2. leaf function:
---------
| fp |

Signed-off-by: Nickhu <nickhu@andestech.com>
Acked-by: Greentime Hu <greentime@andestech.com>
Signed-off-by: Greentime Hu <greentime@andestech.com>

authored by

Nickhu and committed by
Greentime Hu
c8b34461 ebd09753

+299
+299
arch/nds32/kernel/perf_event_cpu.c
··· 1193 1193 1194 1194 device_initcall(register_pmu_driver); 1195 1195 1196 + /* 1197 + * References: arch/nds32/kernel/traps.c:__dump() 1198 + * You will need to know the NDS ABI first. 1199 + */ 1200 + static int unwind_frame_kernel(struct stackframe *frame) 1201 + { 1202 + int graph = 0; 1203 + #ifdef CONFIG_FRAME_POINTER 1204 + /* 0x3 means misalignment */ 1205 + if (!kstack_end((void *)frame->fp) && 1206 + !((unsigned long)frame->fp & 0x3) && 1207 + ((unsigned long)frame->fp >= TASK_SIZE)) { 1208 + /* 1209 + * The array index is based on the ABI, the below graph 1210 + * illustrate the reasons. 1211 + * Function call procedure: "smw" and "lmw" will always 1212 + * update SP and FP for you automatically. 1213 + * 1214 + * Stack Relative Address 1215 + * | | 0 1216 + * ---- 1217 + * |LP| <-- SP(before smw) <-- FP(after smw) -1 1218 + * ---- 1219 + * |FP| -2 1220 + * ---- 1221 + * | | <-- SP(after smw) -3 1222 + */ 1223 + frame->lp = ((unsigned long *)frame->fp)[-1]; 1224 + frame->fp = ((unsigned long *)frame->fp)[FP_OFFSET]; 1225 + /* make sure CONFIG_FUNCTION_GRAPH_TRACER is turned on */ 1226 + if (__kernel_text_address(frame->lp)) 1227 + frame->lp = ftrace_graph_ret_addr 1228 + (NULL, &graph, frame->lp, NULL); 1229 + 1230 + return 0; 1231 + } else { 1232 + return -EPERM; 1233 + } 1234 + #else 1235 + /* 1236 + * You can refer to arch/nds32/kernel/traps.c:__dump() 1237 + * Treat "sp" as "fp", but the "sp" is one frame ahead of "fp". 1238 + * And, the "sp" is not always correct. 1239 + * 1240 + * Stack Relative Address 1241 + * | | 0 1242 + * ---- 1243 + * |LP| <-- SP(before smw) -1 1244 + * ---- 1245 + * | | <-- SP(after smw) -2 1246 + * ---- 1247 + */ 1248 + if (!kstack_end((void *)frame->sp)) { 1249 + frame->lp = ((unsigned long *)frame->sp)[1]; 1250 + /* TODO: How to deal with the value in first 1251 + * "sp" is not correct? 
1252 + */ 1253 + if (__kernel_text_address(frame->lp)) 1254 + frame->lp = ftrace_graph_ret_addr 1255 + (tsk, &graph, frame->lp, NULL); 1256 + 1257 + frame->sp = ((unsigned long *)frame->sp) + 1; 1258 + 1259 + return 0; 1260 + } else { 1261 + return -EPERM; 1262 + } 1263 + #endif 1264 + } 1265 + 1266 + static void notrace 1267 + walk_stackframe(struct stackframe *frame, 1268 + int (*fn_record)(struct stackframe *, void *), 1269 + void *data) 1270 + { 1271 + while (1) { 1272 + int ret; 1273 + 1274 + if (fn_record(frame, data)) 1275 + break; 1276 + 1277 + ret = unwind_frame_kernel(frame); 1278 + if (ret < 0) 1279 + break; 1280 + } 1281 + } 1282 + 1283 + /* 1284 + * Gets called by walk_stackframe() for every stackframe. This will be called 1285 + * whist unwinding the stackframe and is like a subroutine return so we use 1286 + * the PC. 1287 + */ 1288 + static int callchain_trace(struct stackframe *fr, void *data) 1289 + { 1290 + struct perf_callchain_entry_ctx *entry = data; 1291 + 1292 + perf_callchain_store(entry, fr->lp); 1293 + return 0; 1294 + } 1295 + 1296 + /* 1297 + * Get the return address for a single stackframe and return a pointer to the 1298 + * next frame tail. 
1299 + */ 1300 + static unsigned long 1301 + user_backtrace(struct perf_callchain_entry_ctx *entry, unsigned long fp) 1302 + { 1303 + struct frame_tail buftail; 1304 + unsigned long lp = 0; 1305 + unsigned long *user_frame_tail = 1306 + (unsigned long *)(fp - (unsigned long)sizeof(buftail)); 1307 + 1308 + /* Check accessibility of one struct frame_tail beyond */ 1309 + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) 1310 + return 0; 1311 + if (__copy_from_user_inatomic 1312 + (&buftail, user_frame_tail, sizeof(buftail))) 1313 + return 0; 1314 + 1315 + /* 1316 + * Refer to unwind_frame_kernel() for more illurstration 1317 + */ 1318 + lp = buftail.stack_lp; /* ((unsigned long *)fp)[-1] */ 1319 + fp = buftail.stack_fp; /* ((unsigned long *)fp)[FP_OFFSET] */ 1320 + perf_callchain_store(entry, lp); 1321 + return fp; 1322 + } 1323 + 1324 + static unsigned long 1325 + user_backtrace_opt_size(struct perf_callchain_entry_ctx *entry, 1326 + unsigned long fp) 1327 + { 1328 + struct frame_tail_opt_size buftail; 1329 + unsigned long lp = 0; 1330 + 1331 + unsigned long *user_frame_tail = 1332 + (unsigned long *)(fp - (unsigned long)sizeof(buftail)); 1333 + 1334 + /* Check accessibility of one struct frame_tail beyond */ 1335 + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) 1336 + return 0; 1337 + if (__copy_from_user_inatomic 1338 + (&buftail, user_frame_tail, sizeof(buftail))) 1339 + return 0; 1340 + 1341 + /* 1342 + * Refer to unwind_frame_kernel() for more illurstration 1343 + */ 1344 + lp = buftail.stack_lp; /* ((unsigned long *)fp)[-1] */ 1345 + fp = buftail.stack_fp; /* ((unsigned long *)fp)[FP_OFFSET] */ 1346 + 1347 + perf_callchain_store(entry, lp); 1348 + return fp; 1349 + } 1350 + 1351 + /* 1352 + * This will be called when the target is in user mode 1353 + * This function will only be called when we use 1354 + * "PERF_SAMPLE_CALLCHAIN" in 1355 + * kernel/events/core.c:perf_prepare_sample() 1356 + * 1357 + * How to trigger 
perf_callchain_[user/kernel] : 1358 + * $ perf record -e cpu-clock --call-graph fp ./program 1359 + * $ perf report --call-graph 1360 + */ 1361 + unsigned long leaf_fp; 1362 + void 1363 + perf_callchain_user(struct perf_callchain_entry_ctx *entry, 1364 + struct pt_regs *regs) 1365 + { 1366 + unsigned long fp = 0; 1367 + unsigned long gp = 0; 1368 + unsigned long lp = 0; 1369 + unsigned long sp = 0; 1370 + unsigned long *user_frame_tail; 1371 + 1372 + leaf_fp = 0; 1373 + 1374 + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { 1375 + /* We don't support guest os callchain now */ 1376 + return; 1377 + } 1378 + 1379 + perf_callchain_store(entry, regs->ipc); 1380 + fp = regs->fp; 1381 + gp = regs->gp; 1382 + lp = regs->lp; 1383 + sp = regs->sp; 1384 + if (entry->nr < PERF_MAX_STACK_DEPTH && 1385 + (unsigned long)fp && !((unsigned long)fp & 0x7) && fp > sp) { 1386 + user_frame_tail = 1387 + (unsigned long *)(fp - (unsigned long)sizeof(fp)); 1388 + 1389 + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(fp))) 1390 + return; 1391 + 1392 + if (__copy_from_user_inatomic 1393 + (&leaf_fp, user_frame_tail, sizeof(fp))) 1394 + return; 1395 + 1396 + if (leaf_fp == lp) { 1397 + /* 1398 + * Maybe this is non leaf function 1399 + * with optimize for size, 1400 + * or maybe this is the function 1401 + * with optimize for size 1402 + */ 1403 + struct frame_tail buftail; 1404 + 1405 + user_frame_tail = 1406 + (unsigned long *)(fp - 1407 + (unsigned long)sizeof(buftail)); 1408 + 1409 + if (!access_ok 1410 + (VERIFY_READ, user_frame_tail, sizeof(buftail))) 1411 + return; 1412 + 1413 + if (__copy_from_user_inatomic 1414 + (&buftail, user_frame_tail, sizeof(buftail))) 1415 + return; 1416 + 1417 + if (buftail.stack_fp == gp) { 1418 + /* non leaf function with optimize 1419 + * for size condition 1420 + */ 1421 + struct frame_tail_opt_size buftail_opt_size; 1422 + 1423 + user_frame_tail = 1424 + (unsigned long *)(fp - (unsigned long) 1425 + sizeof(buftail_opt_size)); 1426 + 1427 + 
if (!access_ok(VERIFY_READ, user_frame_tail, 1428 + sizeof(buftail_opt_size))) 1429 + return; 1430 + 1431 + if (__copy_from_user_inatomic 1432 + (&buftail_opt_size, user_frame_tail, 1433 + sizeof(buftail_opt_size))) 1434 + return; 1435 + 1436 + perf_callchain_store(entry, lp); 1437 + fp = buftail_opt_size.stack_fp; 1438 + 1439 + while ((entry->nr < PERF_MAX_STACK_DEPTH) && 1440 + (unsigned long)fp && 1441 + !((unsigned long)fp & 0x7) && 1442 + fp > sp) { 1443 + sp = fp; 1444 + fp = user_backtrace_opt_size(entry, fp); 1445 + } 1446 + 1447 + } else { 1448 + /* this is the function 1449 + * without optimize for size 1450 + */ 1451 + fp = buftail.stack_fp; 1452 + perf_callchain_store(entry, lp); 1453 + while ((entry->nr < PERF_MAX_STACK_DEPTH) && 1454 + (unsigned long)fp && 1455 + !((unsigned long)fp & 0x7) && 1456 + fp > sp) { 1457 + sp = fp; 1458 + fp = user_backtrace(entry, fp); 1459 + } 1460 + } 1461 + } else { 1462 + /* this is leaf function */ 1463 + fp = leaf_fp; 1464 + perf_callchain_store(entry, lp); 1465 + 1466 + /* previous function callcahin */ 1467 + while ((entry->nr < PERF_MAX_STACK_DEPTH) && 1468 + (unsigned long)fp && 1469 + !((unsigned long)fp & 0x7) && fp > sp) { 1470 + sp = fp; 1471 + fp = user_backtrace(entry, fp); 1472 + } 1473 + } 1474 + return; 1475 + } 1476 + } 1477 + 1478 + /* This will be called when the target is in kernel mode */ 1479 + void 1480 + perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, 1481 + struct pt_regs *regs) 1482 + { 1483 + struct stackframe fr; 1484 + 1485 + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { 1486 + /* We don't support guest os callchain now */ 1487 + return; 1488 + } 1489 + fr.fp = regs->fp; 1490 + fr.lp = regs->lp; 1491 + fr.sp = regs->sp; 1492 + walk_stackframe(&fr, callchain_trace, entry); 1493 + } 1494 + 1196 1495 unsigned long perf_instruction_pointer(struct pt_regs *regs) 1197 1496 { 1198 1497 /* However, NDS32 does not support virtualization */