Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'mm-nonmm-stable-2025-10-02-15-29' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull non-MM updates from Andrew Morton:

- "ida: Remove the ida_simple_xxx() API" from Christophe Jaillet
completes the removal of this legacy IDR API

- "panic: introduce panic status function family" from Jinchao Wang
provides a number of cleanups to the panic code and its various
helpers, which were rather ad-hoc and scattered all over the place

- "tools/delaytop: implement real-time keyboard interaction support"
from Fan Yu adds a few nice user-facing usability changes to the
delaytop monitoring tool

- "efi: Fix EFI boot with kexec handover (KHO)" from Evangelos
Petrongonas fixes a panic which was happening with the combination of
EFI and KHO

- "Squashfs: performance improvement and a sanity check" from Phillip
Lougher teaches squashfs's lseek() about SEEK_DATA/SEEK_HOLE. A mere
150x speedup was measured for a well-chosen microbenchmark

- plus another 50-odd singleton patches all over the place

* tag 'mm-nonmm-stable-2025-10-02-15-29' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (75 commits)
Squashfs: reject negative file sizes in squashfs_read_inode()
kallsyms: use kmalloc_array() instead of kmalloc()
MAINTAINERS: update Sibi Sankar's email address
Squashfs: add SEEK_DATA/SEEK_HOLE support
Squashfs: add additional inode sanity checking
lib/genalloc: fix device leak in of_gen_pool_get()
panic: remove CONFIG_PANIC_ON_OOPS_VALUE
ocfs2: fix double free in user_cluster_connect()
checkpatch: suppress strscpy warnings for userspace tools
cramfs: fix incorrect physical page address calculation
kernel: prevent prctl(PR_SET_PDEATHSIG) from racing with parent process exit
Squashfs: fix uninit-value in squashfs_get_parent
kho: only fill kimage if KHO is finalized
ocfs2: avoid extra calls to strlen() after ocfs2_sprintf_system_inode_name()
kernel/sys.c: fix the racy usage of task_lock(tsk->group_leader) in sys_prlimit64() paths
sched/task.h: fix the wrong comment on task_lock() nesting with tasklist_lock
coccinelle: platform_no_drv_owner: handle also built-in drivers
coccinelle: of_table: handle SPI device ID tables
lib/decompress: use designated initializers for struct compress_format
efi: support booting with kexec handover (KHO)
...

+1617 -474
+2 -1
.mailmap
···
 Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
 Shuah Khan <shuah@kernel.org> <shuahkh@osg.samsung.com>
 Shuah Khan <shuah@kernel.org> <shuah.kh@samsung.com>
-Sibi Sankar <quic_sibis@quicinc.com> <sibis@codeaurora.org>
+Sibi Sankar <sibi.sankar@oss.qualcomm.com> <sibis@codeaurora.org>
+Sibi Sankar <sibi.sankar@oss.qualcomm.com> <quic_sibis@quicinc.com>
 Sid Manning <quic_sidneym@quicinc.com> <sidneym@codeaurora.org>
 Simon Arlott <simon@octiron.net> <simon@fire.lp0.eu>
 Simona Vetter <simona.vetter@ffwll.ch> <daniel.vetter@ffwll.ch>
+57 -32
Documentation/accounting/delay-accounting.rst
···
 
 After the system starts, use `delaytop` to get the system-wide delay information,
 which includes system-wide PSI information and Top-N high-latency tasks.
+Note: PSI support requires `CONFIG_PSI=y` and `psi=1` for full functionality.
 
-`delaytop` supports sorting by CPU latency in descending order by default,
-displays the top 20 high-latency tasks by default, and refreshes the latency
-data every 2 seconds by default.
+`delaytop` is an interactive tool for monitoring system pressure and task delays.
+It supports multiple sorting options, display modes, and real-time keyboard controls.
 
-Get PSI information and Top-N tasks delay, since system boot::
+Basic usage with default settings (sorts by CPU delay, shows top 20 tasks, refreshes every 2 seconds)::
 
     bash# ./delaytop
-    System Pressure Information: (avg10/avg60/avg300/total)
-    CPU some:       0.0%/   0.0%/   0.0%/     345(ms)
+    System Pressure Information: (avg10/avg60vg300/total)
+    CPU some:       0.0%/   0.0%/   0.0%/  106137(ms)
     CPU full:       0.0%/   0.0%/   0.0%/       0(ms)
     Memory full:    0.0%/   0.0%/   0.0%/       0(ms)
     Memory some:    0.0%/   0.0%/   0.0%/       0(ms)
-    IO full:        0.0%/   0.0%/   0.0%/      65(ms)
-    IO some:        0.0%/   0.0%/   0.0%/      79(ms)
+    IO full:        0.0%/   0.0%/   0.0%/    2240(ms)
+    IO some:        0.0%/   0.0%/   0.0%/    2783(ms)
     IRQ full:       0.0%/   0.0%/   0.0%/       0(ms)
-    Top 20 processes (sorted by CPU delay):
-    PID     TGID    COMMAND         CPU(ms)  IO(ms) SWAP(ms) RCL(ms) THR(ms) CMP(ms)  WP(ms) IRQ(ms)
-    ----------------------------------------------------------------------------------------------
-    161     161     zombie_memcg_re    1.40    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    130     130     blkcg_punt_bio     1.37    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    444     444     scsi_tmf_0         0.73    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    1280    1280    rsyslogd           0.53    0.04    0.00    0.00    0.00    0.00    0.00    0.00
-    12      12      ksoftirqd/0        0.47    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    1277    1277    nbd-server         0.44    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    308     308     kworker/2:2-sys    0.41    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    55      55      netns              0.36    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    1187    1187    acpid              0.31    0.03    0.00    0.00    0.00    0.00    0.00    0.00
-    6184    6184    kworker/1:2-sys    0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    186     186     kaluad             0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    18      18      ksoftirqd/1        0.24    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    185     185     kmpath_rdacd       0.23    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    190     190     kstrp              0.23    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    2759    2759    agetty             0.20    0.03    0.00    0.00    0.00    0.00    0.00    0.00
-    1190    1190    kworker/0:3-sys    0.19    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    1272    1272    sshd               0.15    0.04    0.00    0.00    0.00    0.00    0.00    0.00
-    1156    1156    license            0.15    0.11    0.00    0.00    0.00    0.00    0.00    0.00
-    134     134     md                 0.13    0.00    0.00    0.00    0.00    0.00    0.00    0.00
-    6142    6142    kworker/3:2-xfs    0.13    0.00    0.00    0.00    0.00    0.00    0.00    0.00
+    [o]sort [M]memverbose [q]quit
+    Top 20 processes (sorted by cpu delay):
+    PID     TGID    COMMAND         CPU(ms)  IO(ms) IRQ(ms) MEM(ms)
+    ------------------------------------------------------------------------
+    110     110     kworker/15:0H-s   27.91    0.00    0.00    0.00
+    57      57      cpuhp/7            3.18    0.00    0.00    0.00
+    99      99      cpuhp/14           2.97    0.00    0.00    0.00
+    51      51      cpuhp/6            0.90    0.00    0.00    0.00
+    44      44      kworker/4:0H-sy    0.80    0.00    0.00    0.00
+    60      60      ksoftirqd/7        0.74    0.00    0.00    0.00
+    76      76      idle_inject/10     0.31    0.00    0.00    0.00
+    100     100     idle_inject/14     0.30    0.00    0.00    0.00
+    1309    1309    systemsettings     0.29    0.00    0.00    0.00
+    45      45      cpuhp/5            0.22    0.00    0.00    0.00
+    63      63      cpuhp/8            0.20    0.00    0.00    0.00
+    87      87      cpuhp/12           0.18    0.00    0.00    0.00
+    93      93      cpuhp/13           0.17    0.00    0.00    0.00
+    1265    1265    acpid              0.17    0.00    0.00    0.00
+    1552    1552    sshd               0.17    0.00    0.00    0.00
+    2584    2584    sddm-helper        0.16    0.00    0.00    0.00
+    1284    1284    rtkit-daemon       0.15    0.00    0.00    0.00
+    1326    1326    nde-netfilter      0.14    0.00    0.00    0.00
+    27      27      cpuhp/2            0.13    0.00    0.00    0.00
+    631     631     kworker/11:2-rc    0.11    0.00    0.00    0.00
 
-Dynamic interactive interface of delaytop::
+Interactive keyboard controls during runtime::
+
+    o - Select sort field (CPU, IO, IRQ, Memory, etc.)
+    M - Toggle display mode (Default/Memory Verbose)
+    q - Quit
+
+Available sort fields (use -s/--sort or interactive command)::
+
+    cpu(c)       - CPU delay
+    blkio(i)     - I/O delay
+    irq(q)       - IRQ delay
+    mem(m)       - Total memory delay
+    swapin(s)    - Swapin delay (memory verbose mode only)
+    freepages(r) - Freepages reclaim delay (memory verbose mode only)
+    thrashing(t) - Thrashing delay (memory verbose mode only)
+    compact(p)   - Compaction delay (memory verbose mode only)
+    wpcopy(w)    - Write page copy delay (memory verbose mode only)
+
+Advanced usage examples::
+
+    # ./delaytop -s blkio
+    Sorted by IO delay
+
+    # ./delaytop -s mem -M
+    Sorted by memory delay in memory verbose mode
 
     # ./delaytop -p pid
     Print delayacct stats
+1 -1
Documentation/admin-guide/kernel-parameters.txt
···
 			bit 2: print timer info
 			bit 3: print locks info if CONFIG_LOCKDEP is on
 			bit 4: print ftrace buffer
-			bit 5: replay all messages on consoles at the end of panic
+			bit 5: replay all kernel messages on consoles at the end of panic
 			bit 6: print all CPUs backtrace (if available in the arch)
 			bit 7: print only tasks in uninterruptible (blocked) state
 			*Be aware* that this option may print a _lot_ of lines,
+1 -1
Documentation/admin-guide/sysctl/kernel.rst
···
 bit 2 print timer info
 bit 3 print locks info if ``CONFIG_LOCKDEP`` is on
 bit 4 print ftrace buffer
-bit 5 replay all messages on consoles at the end of panic
+bit 5 replay all kernel messages on consoles at the end of panic
 bit 6 print all CPUs backtrace (if available in the arch)
 bit 7 print only tasks in uninterruptible (blocked) state
 ===== ============================================
+6 -1
Documentation/dev-tools/kcov.rst
···
 	 */
 	sleep(2);
 
-	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+	/*
+	 * The load of the coverage count should be an acquire to pair
+	 * with the corresponding write memory barrier (smp_wmb()) on the
+	 * kernel side in kcov_move_area().
+	 */
+	n = __atomic_load_n(&cover[0], __ATOMIC_ACQUIRE);
 	for (i = 0; i < n; i++)
 		printf("0x%lx\n", cover[i + 1]);
 	if (ioctl(fd, KCOV_DISABLE, 0))
+1 -1
MAINTAINERS
···
 F:	drivers/pmdomain/qcom/cpr.c
 
 QUALCOMM CPUCP MAILBOX DRIVER
-M:	Sibi Sankar <quic_sibis@quicinc.com>
+M:	Sibi Sankar <sibi.sankar@oss.qualcomm.com>
 L:	linux-arm-msm@vger.kernel.org
 S:	Supported
 F:	Documentation/devicetree/bindings/mailbox/qcom,cpucp-mbox.yaml
+19 -6
arch/x86/kernel/crash.c
···
 	/*
 	 * Exclusion of crash region, crashk_low_res and/or crashk_cma_ranges
 	 * may cause range splits. So add extra slots here.
+	 *
+	 * Exclusion of the low 1M may not cause another range split, because
+	 * the excluded range is [0, 1M] and the condition for splitting off a
+	 * new region is that the start and end parameters both fall strictly
+	 * inside an existing region in cmem, without being equal to that
+	 * region's start or end.  Obviously, the start of [0, 1M] cannot meet
+	 * this condition.
+	 *
+	 * But in case the low 1M range is changed in the future (e.g. to
+	 * [start, 1M]), add an extra slot.
 	 */
-	nr_ranges += 2 + crashk_cma_cnt;
+	nr_ranges += 3 + crashk_cma_cnt;
 	cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
 	if (!cmem)
 		return NULL;
 
 	cmem->max_nr_ranges = nr_ranges;
-	cmem->nr_ranges = 0;
 
 	return cmem;
 }
···
 	struct crash_mem *cmem;
 
 	/*
-	 * Using random kexec_buf for passing dm crypt keys may cause a range
-	 * split. So use two slots here.
+	 * In the current x86 architecture code, the elfheader is always
+	 * allocated at crashk_res.start.  But this depends on the allocation
+	 * position of the elfheader in crashk_res; to avoid a potential
+	 * out-of-bounds access in the future, add an extra slot.
+	 *
+	 * And using a random kexec_buf for passing dm crypt keys may cause a
+	 * range split too, so add another extra slot here.
 	 */
-	nr_ranges = 2;
+	nr_ranges = 3;
 	cmem = vzalloc(struct_size(cmem, ranges, nr_ranges));
 	if (!cmem)
 		return -ENOMEM;
 
 	cmem->max_nr_ranges = nr_ranges;
-	cmem->nr_ranges = 0;
 
 	memset(&cmd, 0, sizeof(struct crash_memmap_data));
 	cmd.params = params;
+44 -3
arch/x86/kernel/kexec-bzimage64.c
···
 #include <linux/kexec.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/libfdt.h>
+#include <linux/of_fdt.h>
 #include <linux/efi.h>
 #include <linux/random.h>
···
 }
 #endif /* CONFIG_EFI */
 
+#ifdef CONFIG_OF_FLATTREE
+static void setup_dtb(struct boot_params *params,
+		      unsigned long params_load_addr,
+		      unsigned int dtb_setup_data_offset)
+{
+	struct setup_data *sd = (void *)params + dtb_setup_data_offset;
+	unsigned long setup_data_phys, dtb_len;
+
+	dtb_len = fdt_totalsize(initial_boot_params);
+	sd->type = SETUP_DTB;
+	sd->len = dtb_len;
+
+	/* Carry over current boot DTB with setup_data */
+	memcpy(sd->data, initial_boot_params, dtb_len);
+
+	/* Add setup data */
+	setup_data_phys = params_load_addr + dtb_setup_data_offset;
+	sd->next = params->hdr.setup_data;
+	params->hdr.setup_data = setup_data_phys;
+}
+#endif /* CONFIG_OF_FLATTREE */
+
 static void
 setup_ima_state(const struct kimage *image, struct boot_params *params,
 		unsigned long params_load_addr,
···
 			setup_data_offset);
 	setup_data_offset += sizeof(struct setup_data) +
 			sizeof(struct efi_setup_data);
+#endif
+
+#ifdef CONFIG_OF_FLATTREE
+	if (image->force_dtb && initial_boot_params) {
+		setup_dtb(params, params_load_addr, setup_data_offset);
+		setup_data_offset += sizeof(struct setup_data) +
+				fdt_totalsize(initial_boot_params);
+	} else {
+		pr_debug("Not carrying over DTB, force_dtb = %d\n",
+			 image->force_dtb);
+	}
 #endif
 
 	if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
···
 			sizeof(struct setup_data) +
 			RNG_SEED_LENGTH;
 
+#ifdef CONFIG_OF_FLATTREE
+	if (image->force_dtb && initial_boot_params)
+		kbuf.bufsz += sizeof(struct setup_data) +
+				fdt_totalsize(initial_boot_params);
+#endif
+
 	if (IS_ENABLED(CONFIG_IMA_KEXEC))
 		kbuf.bufsz += sizeof(struct setup_data) +
 			      sizeof(struct ima_setup_data);
···
 		kbuf.bufsz += sizeof(struct setup_data) +
 			      sizeof(struct kho_data);
 
-	params = kzalloc(kbuf.bufsz, GFP_KERNEL);
+	params = kvzalloc(kbuf.bufsz, GFP_KERNEL);
 	if (!params)
 		return ERR_PTR(-ENOMEM);
 	efi_map_offset = params_cmdline_sz;
···
 	return ldata;
 
 out_free_params:
-	kfree(params);
+	kvfree(params);
 	return ERR_PTR(ret);
 }
 
···
 	if (!ldata)
 		return 0;
 
-	kfree(ldata->bootparams_buf);
+	kvfree(ldata->bootparams_buf);
 	ldata->bootparams_buf = NULL;
 
 	return 0;
+25 -4
drivers/firmware/efi/efi-init.c
···
 #include <linux/efi.h>
 #include <linux/fwnode.h>
 #include <linux/init.h>
+#include <linux/kexec_handover.h>
 #include <linux/memblock.h>
 #include <linux/mm_types.h>
 #include <linux/of.h>
···
 	pr_info("Processing EFI memory map:\n");
 
 	/*
-	 * Discard memblocks discovered so far: if there are any at this
-	 * point, they originate from memory nodes in the DT, and UEFI
-	 * uses its own memory map instead.
+	 * Discard memblocks discovered so far except for KHO scratch
+	 * regions.  Most memblocks at this point originate from memory
+	 * nodes in the DT and UEFI uses its own memory map instead.
+	 * However, if KHO is enabled, scratch regions, which are known
+	 * good memory, must be preserved.
 	 */
 	memblock_dump_all();
-	memblock_remove(0, PHYS_ADDR_MAX);
+
+	if (is_kho_boot()) {
+		struct memblock_region *r;
+
+		/* Remove all non-KHO regions */
+		for_each_mem_region(r) {
+			if (!memblock_is_kho_scratch(r)) {
+				memblock_remove(r->base, r->size);
+				r--;
+			}
+		}
+	} else {
+		/*
+		 * KHO is disabled.  Discard memblocks discovered so far:
+		 * if there are any at this point, they originate from
+		 * memory nodes in the DT, and UEFI uses its own memory
+		 * map instead.
+		 */
+		memblock_remove(0, PHYS_ADDR_MAX);
+	}
 
 	for_each_efi_memory_desc(md) {
 		paddr = md->phys_addr;
+1 -8
drivers/video/fbdev/core/fbcon.c
···
 
 static bool fbcon_skip_panic(struct fb_info *info)
 {
-	/* panic_cpu is not exported, and can't be used if built as module. Use
-	 * oops_in_progress instead, but non-fatal oops won't be printed.
-	 */
-#if defined(MODULE)
-	return (info->skip_panic && unlikely(oops_in_progress));
-#else
-	return (info->skip_panic && unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID));
-#endif
+	return (info->skip_panic && unlikely(panic_in_progress()));
 }
 
 static inline bool fbcon_is_active(struct vc_data *vc, struct fb_info *info)
+1 -1
fs/cramfs/inode.c
···
 			vm_fault_t vmf;
 			unsigned long off = i * PAGE_SIZE;
 			vmf = vmf_insert_mixed(vma, vma->vm_start + off,
-					address + off);
+					PHYS_PFN(address + off));
 			if (vmf & VM_FAULT_ERROR)
 				ret = vm_fault_to_errno(vmf, 0);
 		}
+3 -4
fs/fat/dir.c
···
 
 static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
 			       int *nr_cluster, struct msdos_dir_entry **de,
-			       struct buffer_head **bh, loff_t *i_pos)
+			       struct buffer_head **bh)
 {
 	struct super_block *sb = dir->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
···
 	get_bh(bhs[n]);
 	*bh = bhs[n];
 	*de = (struct msdos_dir_entry *)((*bh)->b_data + offset);
-	*i_pos = fat_make_i_pos(sb, *bh, *de);
 
 	/* Second stage: clear the rest of cluster, and write outs */
 	err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE);
···
 	struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
 	struct msdos_dir_entry *de;
 	int err, free_slots, i, nr_bhs;
-	loff_t pos, i_pos;
+	loff_t pos;
 
 	sinfo->nr_slots = nr_slots;
 
···
 	 * add the cluster to dir.
 	 */
 	cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster,
-				      &de, &bh, &i_pos);
+				      &de, &bh);
 	if (cluster < 0) {
 		err = cluster;
 		goto error_remove;
+1 -2
fs/ocfs2/alloc.c
···
 
 out:
 	if (ret != 0) {
-		if (folios)
-			ocfs2_unlock_and_free_folios(folios, numfolios);
+		ocfs2_unlock_and_free_folios(folios, numfolios);
 		numfolios = 0;
 	}
 
-11
fs/ocfs2/dlm/dlmmaster.c
···
 		goto send_response;
 	} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
 		spin_unlock(&res->spinlock);
-		// mlog(0, "node %u is the master\n", res->owner);
 		response = DLM_MASTER_RESP_NO;
 		if (mle)
 			kmem_cache_free(dlm_mle_cache, mle);
···
 		BUG();
 	}
 
-	// mlog(0, "lockres is in progress...\n");
 	spin_lock(&dlm->master_lock);
 	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
 	if (!found) {
···
 		set_maybe = 1;
 		spin_lock(&tmpmle->spinlock);
 		if (tmpmle->type == DLM_MLE_BLOCK) {
-			// mlog(0, "this node is waiting for "
-			// "lockres to be mastered\n");
 			response = DLM_MASTER_RESP_NO;
 		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
 			mlog(0, "node %u is master, but trying to migrate to "
···
 		} else
 			response = DLM_MASTER_RESP_NO;
 	} else {
-		// mlog(0, "this node is attempting to "
-		// "master lockres\n");
 		response = DLM_MASTER_RESP_MAYBE;
 	}
 	if (set_maybe)
···
 	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
 	if (!found) {
 		/* this lockid has never been seen on this node yet */
-		// mlog(0, "no mle found\n");
 		if (!mle) {
 			spin_unlock(&dlm->master_lock);
 			spin_unlock(&dlm->spinlock);
···
 		goto way_up_top;
 	}
 
-	// mlog(0, "this is second time thru, already allocated, "
-	// "add the block.\n");
 	dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
 	set_bit(request->node_idx, mle->maybe_map);
 	__dlm_insert_mle(dlm, mle);
···
 		spin_unlock(&res->spinlock);
 	}
 
-	// mlog(0, "woo! got an assert_master from node %u!\n",
-	// assert->node_idx);
 	if (mle) {
 		int extra_ref = 0;
 		int nn = -1;
-1
fs/ocfs2/dlm/dlmrecovery.c
···
 	}
 
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
-		// mlog(0, "nothing to recover! sleeping now!\n");
 		spin_unlock(&dlm->spinlock);
 		/* return to main thread loop and sleep. */
 		return 0;
+8
fs/ocfs2/inode.c
···
 		goto bail;
 	}
 
+	if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT &&
+	    (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) {
+		rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n",
+				 (unsigned long long)bh->b_blocknr,
+				 le16_to_cpu(di->i_suballoc_slot));
+		goto bail;
+	}
+
 	rc = 0;
 
 bail:
+7 -11
fs/ocfs2/ioctl.c
···
 				goto bail;
 			}
 		} else {
-			ocfs2_sprintf_system_inode_name(namebuf,
-							sizeof(namebuf),
-							type, i);
+			int len = ocfs2_sprintf_system_inode_name(namebuf,
+								  sizeof(namebuf),
+								  type, i);
 			status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
-							    namebuf,
-							    strlen(namebuf),
-							    &blkno);
+							    namebuf, len, &blkno);
 			if (status < 0) {
 				status = -ENOENT;
 				goto bail;
···
 			goto bail;
 		}
 	} else {
-		ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type,
-						OCFS2_INVALID_SLOT);
+		int len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf),
+							  type, OCFS2_INVALID_SLOT);
 		status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
-						    namebuf,
-						    strlen(namebuf),
-						    &blkno);
+						    namebuf, len, &blkno);
 		if (status < 0) {
 			status = -ENOENT;
 			goto bail;
+4 -4
fs/ocfs2/move_extents.c
···
 				       int *vict_bit,
 				       struct buffer_head **ret_bh)
 {
-	int ret, i, bits_per_unit = 0;
+	int ret, i, len, bits_per_unit = 0;
 	u64 blkno;
 	char namebuf[40];
 
···
 	struct ocfs2_dinode *ac_dinode;
 	struct ocfs2_group_desc *bg;
 
-	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
-	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
-					 strlen(namebuf), &blkno);
+	len = ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
+	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, len, &blkno);
+
 	if (ret) {
 		ret = -ENOENT;
 		goto out;
+1 -1
fs/ocfs2/ocfs2_fs.h
···
 	__le16 s_reserved0;
 	__le32 s_dx_seed[3];		/* seed[0-2] for dx dir hash.
 					 * s_uuid_hash serves as seed[3]. */
-/*C0*/  __le64 s_reserved2[15];	/* Fill out superblock */
+/*C8*/  __le64 s_reserved2[15];	/* Fill out superblock */
 /*140*/
 
 /*
+1
fs/ocfs2/stack_user.c
···
 		printk(KERN_ERR "ocfs2: Could not determine"
 		       " locking version\n");
 		user_cluster_disconnect(conn);
+		lc = NULL;
 		goto out;
 	}
 	wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+6 -6
fs/ocfs2/sysfile.c
···
 	char namebuf[40];
 	struct inode *inode = NULL;
 	u64 blkno;
-	int status = 0;
+	int len, status = 0;
 
-	ocfs2_sprintf_system_inode_name(namebuf,
-					sizeof(namebuf),
-					type, slot);
+	len = ocfs2_sprintf_system_inode_name(namebuf,
+					      sizeof(namebuf),
+					      type, slot);
 
-	status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
-					    strlen(namebuf), &blkno);
+	status = ocfs2_lookup_ino_from_name(osb->sys_root_inode,
+					    namebuf, len, &blkno);
 	if (status < 0) {
 		goto bail;
 	}
+1 -1
fs/proc/base.c
···
 		tid = task_pid_nr_ns(task, ns);
 		if (!tid)
 			continue;	/* The task has just exited. */
-		len = snprintf(name, sizeof(name), "%u", tid);
+		len = snprintf(name, sizeof(name), "%d", tid);
 		if (!proc_fill_cache(file, ctx, name, len,
 				proc_task_instantiate, task, NULL)) {
 			/* returning this tgid failed, save it as the first
+126 -11
fs/squashfs/file.c
···
 all_done:
 	*index_block = cur_index_block;
 	*index_offset = cur_offset;
-	*data_block = cur_data_block;
+	if (data_block)
+		*data_block = cur_data_block;
 
 	/*
 	 * Scale cache index (cache slot entry) to index
···
  * Get the on-disk location and compressed size of the datablock
  * specified by index.  Fill_meta_index() does most of the work.
  */
-static int read_blocklist(struct inode *inode, int index, u64 *block)
+static int read_blocklist_ptrs(struct inode *inode, int index, u64 *start,
+		int *offset, u64 *block)
 {
-	u64 start;
 	long long blks;
-	int offset;
 	__le32 size;
-	int res = fill_meta_index(inode, index, &start, &offset, block);
+	int res = fill_meta_index(inode, index, start, offset, block);
 
-	TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
-		" 0x%x, block 0x%llx\n", res, index, start, offset,
-		*block);
+	TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset 0x%x, block 0x%llx\n",
+		res, index, *start, *offset, block ? *block : 0);
 
 	if (res < 0)
 		return res;
···
 	 * extra block indexes needed.
 	 */
 	if (res < index) {
-		blks = read_indexes(inode->i_sb, index - res, &start, &offset);
+		blks = read_indexes(inode->i_sb, index - res, start, offset);
 		if (blks < 0)
 			return (int) blks;
-		*block += blks;
+		if (block)
+			*block += blks;
 	}
 
 	/*
 	 * Read length of block specified by index.
 	 */
-	res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset,
+	res = squashfs_read_metadata(inode->i_sb, &size, start, offset,
 			sizeof(size));
 	if (res < 0)
 		return res;
 	return squashfs_block_size(size);
+}
+
+static inline int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+	u64 start;
+	int offset;
+
+	return read_blocklist_ptrs(inode, index, &start, &offset, block);
 }
 
 static bool squashfs_fill_page(struct folio *folio,
···
 	kfree(pages);
 }
 
+static loff_t seek_hole_data(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 start, index = offset >> msblk->block_log;
+	u64 file_end = (i_size_read(inode) + msblk->block_size - 1) >> msblk->block_log;
+	int s_offset, length;
+	__le32 *blist = NULL;
+
+	/* reject offset if negative or beyond file end */
+	if ((unsigned long long)offset >= i_size_read(inode))
+		return -ENXIO;
+
+	/* is offset within tailend and is tailend packed into a fragment? */
+	if (index + 1 == file_end &&
+			squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
+		if (whence == SEEK_DATA)
+			return offset;
+
+		/* there is an implicit hole at the end of any file */
+		return i_size_read(inode);
+	}
+
+	length = read_blocklist_ptrs(inode, index, &start, &s_offset, NULL);
+	if (length < 0)
+		return length;
+
+	/* nothing more to do if offset matches desired whence value */
+	if ((length == 0 && whence == SEEK_HOLE) ||
+			(length && whence == SEEK_DATA))
+		return offset;
+
+	/* skip scanning forwards if we're at file end */
+	if (++index == file_end)
+		goto not_found;
+
+	blist = kmalloc(SQUASHFS_SCAN_INDEXES << 2, GFP_KERNEL);
+	if (blist == NULL) {
+		ERROR("%s: Failed to allocate block_list\n", __func__);
+		return -ENOMEM;
+	}
+
+	while (index < file_end) {
+		int i, indexes = min(file_end - index, SQUASHFS_SCAN_INDEXES);
+
+		offset = squashfs_read_metadata(sb, blist, &start, &s_offset, indexes << 2);
+		if (offset < 0)
+			goto finished;
+
+		for (i = 0; i < indexes; i++) {
+			length = squashfs_block_size(blist[i]);
+			if (length < 0) {
+				offset = length;
+				goto finished;
+			}
+
+			/* does this block match desired whence value? */
+			if ((length == 0 && whence == SEEK_HOLE) ||
+					(length && whence == SEEK_DATA)) {
+				offset = (index + i) << msblk->block_log;
+				goto finished;
+			}
+		}
+
+		index += indexes;
+	}
+
+not_found:
+	/* whence value determines what happens */
+	if (whence == SEEK_DATA)
+		offset = -ENXIO;
+	else
+		/* there is an implicit hole at the end of any file */
+		offset = i_size_read(inode);
+
+finished:
+	kfree(blist);
+	return offset;
+}
+
+static loff_t squashfs_llseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+
+	switch (whence) {
+	default:
+		return generic_file_llseek(file, offset, whence);
+	case SEEK_DATA:
+	case SEEK_HOLE:
+		offset = seek_hole_data(file, offset, whence);
+		break;
+	}
+
+	if (offset < 0)
+		return offset;
+
+	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+}
+
 const struct address_space_operations squashfs_aops = {
 	.read_folio = squashfs_read_folio,
 	.readahead = squashfs_readahead
+};
+
+const struct file_operations squashfs_file_operations = {
+	.llseek = squashfs_llseek,
+	.read_iter = generic_file_read_iter,
+	.mmap_prepare = generic_file_readonly_mmap_prepare,
+	.splice_read = filemap_splice_read
 };
+35 -4
fs/squashfs/inode.c
··· 68 68 inode->i_mode = le16_to_cpu(sqsh_ino->mode); 69 69 inode->i_size = 0; 70 70 71 + /* File type must not be set at this moment, for it will later be set by the caller. */ 72 + if (inode->i_mode & S_IFMT) 73 + err = -EIO; 74 + 71 75 return err; 72 76 } 73 77 ··· 144 140 if (err < 0) 145 141 goto failed_read; 146 142 143 + inode->i_size = le32_to_cpu(sqsh_ino->file_size); 147 144 frag = le32_to_cpu(sqsh_ino->fragment); 148 145 if (frag != SQUASHFS_INVALID_FRAG) { 146 + /* 147 + * the file cannot have a fragment (tailend) and have a 148 + * file size a multiple of the block size 149 + */ 150 + if ((inode->i_size & (msblk->block_size - 1)) == 0) { 151 + err = -EINVAL; 152 + goto failed_read; 153 + } 149 154 frag_offset = le32_to_cpu(sqsh_ino->offset); 150 155 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); 151 156 if (frag_size < 0) { ··· 168 155 } 169 156 170 157 set_nlink(inode, 1); 171 - inode->i_size = le32_to_cpu(sqsh_ino->file_size); 172 - inode->i_fop = &generic_ro_fops; 158 + inode->i_fop = &squashfs_file_operations; 173 159 inode->i_mode |= S_IFREG; 174 160 inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; 175 161 squashfs_i(inode)->fragment_block = frag_blk; ··· 177 165 squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); 178 166 squashfs_i(inode)->block_list_start = block; 179 167 squashfs_i(inode)->offset = offset; 168 + squashfs_i(inode)->parent = 0; 180 169 inode->i_data.a_ops = &squashfs_aops; 181 170 182 171 TRACE("File inode %x:%x, start_block %llx, block_list_start " ··· 196 183 if (err < 0) 197 184 goto failed_read; 198 185 186 + inode->i_size = le64_to_cpu(sqsh_ino->file_size); 187 + if (inode->i_size < 0) { 188 + err = -EINVAL; 189 + goto failed_read; 190 + } 199 191 frag = le32_to_cpu(sqsh_ino->fragment); 200 192 if (frag != SQUASHFS_INVALID_FRAG) { 193 + /* 194 + * the file cannot have a fragment (tailend) and have a 195 + * file size a multiple of the block size 196 + */ 197 + if ((inode->i_size & (msblk->block_size - 1)) == 0) {
198 + err = -EINVAL; 199 + goto failed_read; 200 + } 201 201 frag_offset = le32_to_cpu(sqsh_ino->offset); 202 202 frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); 203 203 if (frag_size < 0) { ··· 225 199 226 200 xattr_id = le32_to_cpu(sqsh_ino->xattr); 227 201 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); 228 - inode->i_size = le64_to_cpu(sqsh_ino->file_size); 229 202 inode->i_op = &squashfs_inode_ops; 230 - inode->i_fop = &generic_ro_fops; 203 + inode->i_fop = &squashfs_file_operations; 231 204 inode->i_mode |= S_IFREG; 232 205 inode->i_blocks = (inode->i_size - 233 206 le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; ··· 237 212 squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); 238 213 squashfs_i(inode)->block_list_start = block; 239 214 squashfs_i(inode)->offset = offset; 215 + squashfs_i(inode)->parent = 0; 240 216 inode->i_data.a_ops = &squashfs_aops; 241 217 242 218 TRACE("File inode %x:%x, start_block %llx, block_list_start " ··· 318 292 inode->i_mode |= S_IFLNK; 319 293 squashfs_i(inode)->start = block; 320 294 squashfs_i(inode)->offset = offset; 295 + squashfs_i(inode)->parent = 0; 321 296 322 297 if (type == SQUASHFS_LSYMLINK_TYPE) { 323 298 __le32 xattr; ··· 356 329 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); 357 330 rdev = le32_to_cpu(sqsh_ino->rdev); 358 331 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 332 + squashfs_i(inode)->parent = 0; 359 333 360 334 TRACE("Device inode %x:%x, rdev %x\n", 361 335 SQUASHFS_INODE_BLK(ino), offset, rdev); ··· 381 353 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); 382 354 rdev = le32_to_cpu(sqsh_ino->rdev); 383 355 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); 356 + squashfs_i(inode)->parent = 0; 384 357 385 358 TRACE("Device inode %x:%x, rdev %x\n", 386 359 SQUASHFS_INODE_BLK(ino), offset, rdev); ··· 402 373 inode->i_mode |= S_IFSOCK; 403 374 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); 404 375 init_special_inode(inode, inode->i_mode, 0); 376 + squashfs_i(inode)->parent = 0;
405 377 break; 406 378 } 407 379 case SQUASHFS_LFIFO_TYPE: ··· 422 392 inode->i_op = &squashfs_inode_ops; 423 393 set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); 424 394 init_special_inode(inode, inode->i_mode, 0); 395 + squashfs_i(inode)->parent = 0; 425 396 break; 426 397 } 427 398 default:
+1
fs/squashfs/squashfs.h
··· 107 107 108 108 /* inode.c */ 109 109 extern const struct inode_operations squashfs_inode_ops; 110 + extern const struct file_operations squashfs_file_operations; 110 111 111 112 /* namei.c */ 112 113 extern const struct inode_operations squashfs_dir_inode_ops;
+1
fs/squashfs/squashfs_fs.h
··· 208 208 #define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) 209 209 #define SQUASHFS_META_ENTRIES 127 210 210 #define SQUASHFS_META_SLOTS 8 211 + #define SQUASHFS_SCAN_INDEXES 1024 211 212 212 213 struct meta_entry { 213 214 u64 data_block;
+1 -1
fs/squashfs/squashfs_fs_i.h
··· 16 16 u64 xattr; 17 17 unsigned int xattr_size; 18 18 int xattr_count; 19 + int parent; 19 20 union { 20 21 struct { 21 22 u64 fragment_block; ··· 28 27 u64 dir_idx_start; 29 28 int dir_idx_offset; 30 29 int dir_idx_cnt; 31 - int parent; 32 30 }; 33 31 }; 34 32 struct inode vfs_inode;
-8
include/linux/idr.h
··· 334 334 xa_init_flags(&ida->xa, IDA_INIT_FLAGS); 335 335 } 336 336 337 - /* 338 - * ida_simple_get() and ida_simple_remove() are deprecated. Use 339 - * ida_alloc() and ida_free() instead respectively. 340 - */ 341 - #define ida_simple_get(ida, start, end, gfp) \ 342 - ida_alloc_range(ida, start, (end) - 1, gfp) 343 - #define ida_simple_remove(ida, id) ida_free(ida, id) 344 - 345 337 static inline bool ida_is_empty(const struct ida *ida) 346 338 { 347 339 return xa_empty(&ida->xa);
+17 -4
include/linux/kernel.h
··· 164 164 165 165 extern bool early_boot_irqs_disabled; 166 166 167 - /* 168 - * Values used for system_state. Ordering of the states must not be changed 167 + /** 168 + * enum system_states - Values used for system_state. 169 + * 170 + * @SYSTEM_BOOTING: %0, no init needed 171 + * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU 172 + * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running 173 + * @SYSTEM_RUNNING: system is up and running 174 + * @SYSTEM_HALT: system entered clean system halt state 175 + * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state 176 + * @SYSTEM_RESTART: system entered emergency power off or normal restart 177 + * @SYSTEM_SUSPEND: system entered suspend or hibernate state 178 + * 179 + * Note: 180 + * Ordering of the states must not be changed 169 181 * as code checks for <, <=, >, >= STATE. 170 182 */ 171 - extern enum system_states { 183 + enum system_states { 172 184 SYSTEM_BOOTING, 173 185 SYSTEM_SCHEDULING, 174 186 SYSTEM_FREEING_INITMEM, ··· 189 177 SYSTEM_POWER_OFF, 190 178 SYSTEM_RESTART, 191 179 SYSTEM_SUSPEND, 192 - } system_state; 180 + }; 181 + extern enum system_states system_state; 193 182 194 183 /* 195 184 * General tracing related utility functions - trace_printk(),
+4 -1
include/linux/kexec.h
··· 395 395 396 396 /* Information for loading purgatory */ 397 397 struct purgatory_info purgatory_info; 398 + 399 + /* Force carrying over the DTB from the current boot */ 400 + bool force_dtb; 398 401 #endif 399 402 400 403 #ifdef CONFIG_CRASH_HOTPLUG ··· 464 461 /* List of defined/legal kexec file flags */ 465 462 #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ 466 463 KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ 467 - KEXEC_FILE_NO_CMA) 464 + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) 468 465 469 466 /* flag to track if kexec reboot is in progress */ 470 467 extern bool kexec_in_progress;
+6
include/linux/kexec_handover.h
··· 40 40 41 41 #ifdef CONFIG_KEXEC_HANDOVER 42 42 bool kho_is_enabled(void); 43 + bool is_kho_boot(void); 43 44 44 45 int kho_preserve_folio(struct folio *folio); 45 46 int kho_preserve_phys(phys_addr_t phys, size_t size); ··· 57 56 u64 scratch_len); 58 57 #else 59 58 static inline bool kho_is_enabled(void) 59 + { 60 + return false; 61 + } 62 + 63 + static inline bool is_kho_boot(void) 60 64 { 61 65 return false; 62 66 }
+8
include/linux/list.h
··· 20 20 * using the generic single-entry routines. 21 21 */ 22 22 23 + /** 24 + * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself 25 + * @name: name of the list_head 26 + */ 23 27 #define LIST_HEAD_INIT(name) { &(name), &(name) } 24 28 29 + /** 30 + * LIST_HEAD - definition of a &struct list_head with initialization values 31 + * @name: name of the list_head 32 + */ 25 33 #define LIST_HEAD(name) \ 26 34 struct list_head name = LIST_HEAD_INIT(name) 27 35
+13
include/linux/moduleparam.h
··· 349 349 __module_param_call("", name, &param_ops_##type, &var, perm, \ 350 350 -1, KERNEL_PARAM_FL_UNSAFE) 351 351 352 + /** 353 + * __core_param_cb - similar like core_param, with a set/get ops instead of type. 354 + * @name: the name of the cmdline and sysfs parameter (often the same as var) 355 + * @var: the variable 356 + * @ops: the set & get operations for this parameter. 357 + * @perm: visibility in sysfs 358 + * 359 + * Ideally this should be called 'core_param_cb', but the name has been 360 + * used for module core parameter, so add the '__' prefix 361 + */ 362 + #define __core_param_cb(name, ops, arg, perm) \ 363 + __module_param_call("", name, ops, arg, perm, -1, 0) 364 + 352 365 #endif /* !MODULE */ 353 366 354 367 /**
+1 -1
include/linux/nvmem-provider.h
··· 103 103 * 104 104 * Note: A default "nvmem<id>" name will be assigned to the device if 105 105 * no name is specified in its configuration. In such case "<id>" is 106 - * generated with ida_simple_get() and provided id field is ignored. 106 + * generated with ida_alloc() and provided id field is ignored. 107 107 * 108 108 * Note: Specifying name and setting id to -1 implies a unique device 109 109 * whose name is provided as-is (kept unaltered).
+6
include/linux/panic.h
··· 43 43 extern atomic_t panic_cpu; 44 44 #define PANIC_CPU_INVALID -1 45 45 46 + bool panic_try_start(void); 47 + void panic_reset(void); 48 + bool panic_in_progress(void); 49 + bool panic_on_this_cpu(void); 50 + bool panic_on_other_cpu(void); 51 + 46 52 /* 47 53 * Only to be used by arch init code. If the user over-wrote the default 48 54 * CONFIG_PANIC_TIMEOUT, honor it.
-2
include/linux/printk.h
··· 330 330 331 331 #endif 332 332 333 - bool this_cpu_in_panic(void); 334 - 335 333 #ifdef CONFIG_SMP 336 334 extern int __printk_cpu_sync_try_get(void); 337 335 extern void __printk_cpu_sync_wait(void);
+2 -3
include/linux/sched/task.h
··· 210 210 * pins the final release of task.io_context. Also protects ->cpuset and 211 211 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. 212 212 * 213 - * Nests both inside and outside of read_lock(&tasklist_lock). 214 - * It must not be nested with write_lock_irq(&tasklist_lock), 215 - * neither inside nor outside. 213 + * Nests inside of read_lock(&tasklist_lock). It must not be nested with 214 + * write_lock_irq(&tasklist_lock), neither inside nor outside. 216 215 */ 217 216 static inline void task_lock(struct task_struct *p) 218 217 {
+4
include/uapi/linux/kexec.h
··· 22 22 * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. 23 23 * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd 24 24 * fd field. 25 + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new 26 + * kernel on x86. This is already the default behavior on 27 + * some other architectures, like ARM64 and PowerPC. 25 28 */ 26 29 #define KEXEC_FILE_UNLOAD 0x00000001 27 30 #define KEXEC_FILE_ON_CRASH 0x00000002 28 31 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 29 32 #define KEXEC_FILE_DEBUG 0x00000008 30 33 #define KEXEC_FILE_NO_CMA 0x00000010 34 + #define KEXEC_FILE_FORCE_DTB 0x00000020 31 35 32 36 /* These values match the ELF architecture values. 33 37 * Unless there is a good reason that should continue to be the case.
+12
init/main.c
··· 545 545 const char *unused, void *arg) 546 546 { 547 547 size_t len = strlen(param); 548 + /* 549 + * Well-known bootloader identifiers: 550 + * 1. LILO/Grub pass "BOOT_IMAGE=..."; 551 + * 2. kexec/kdump (kexec-tools) pass "kexec". 552 + */ 553 + const char *bootloader[] = { "BOOT_IMAGE=", "kexec", NULL }; 548 554 549 555 /* Handle params aliased to sysctls */ 550 556 if (sysctl_is_alias(param)) 551 557 return 0; 552 558 553 559 repair_env_string(param, val); 560 + 561 + /* Handle bootloader identifier */ 562 + for (int i = 0; bootloader[i]; i++) { 563 + if (strstarts(param, bootloader[i])) 564 + return 0; 565 + } 554 566 555 567 /* Handle obsolete-style parameters */ 556 568 if (obsolete_checksetup(param))
+11
kernel/Kconfig.kexec
··· 148 148 CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that 149 149 is required to be built-in. 150 150 151 + config CRASH_DUMP_KUNIT_TEST 152 + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS 153 + depends on CRASH_DUMP && KUNIT 154 + default KUNIT_ALL_TESTS 155 + help 156 + This option builds KUnit unit tests for kernel crash dumps. The unit 157 + tests will be used to verify the correctness of covered functions and 158 + also prevent any regression. 159 + 160 + If unsure, say N. 161 + 151 162 config CRASH_HOTPLUG 152 163 bool "Update the crash elfcorehdr on system configuration changes" 153 164 default y
+1
kernel/Makefile
··· 78 78 obj-$(CONFIG_KEXEC_CORE) += kexec_core.o 79 79 obj-$(CONFIG_CRASH_DUMP) += crash_core.o 80 80 obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o 81 + obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o 81 82 obj-$(CONFIG_KEXEC) += kexec.o 82 83 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o 83 84 obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
+18 -12
kernel/crash_core.c
··· 22 22 #include <linux/btf.h> 23 23 #include <linux/objtool.h> 24 24 #include <linux/delay.h> 25 + #include <linux/panic.h> 25 26 26 27 #include <asm/page.h> 27 28 #include <asm/sections.h> ··· 144 143 145 144 __bpf_kfunc void crash_kexec(struct pt_regs *regs) 146 145 { 147 - int old_cpu, this_cpu; 148 - 149 - /* 150 - * Only one CPU is allowed to execute the crash_kexec() code as with 151 - * panic(). Otherwise parallel calls of panic() and crash_kexec() 152 - * may stop each other. To exclude them, we use panic_cpu here too. 153 - */ 154 - old_cpu = PANIC_CPU_INVALID; 155 - this_cpu = raw_smp_processor_id(); 156 - 157 - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { 146 + if (panic_try_start()) { 158 147 /* This is the 1st CPU which comes here, so go ahead. */ 159 148 __crash_kexec(regs); 160 149 ··· 152 161 * Reset panic_cpu to allow another panic()/crash_kexec() 153 162 * call. 154 163 */ 155 - atomic_set(&panic_cpu, PANIC_CPU_INVALID); 164 + panic_reset(); 156 165 } 157 166 } 158 167 ··· 265 274 return 0; 266 275 } 267 276 277 + /** 278 + * crash_exclude_mem_range - exclude a mem range for existing ranges 279 + * @mem: mem->range contains an array of ranges sorted in ascending order 280 + * @mstart: the start of to-be-excluded range 281 + * @mend: the start of to-be-excluded range 282 + * 283 + * If you are unsure if a range split will happen, to avoid function call 284 + * failure because of -ENOMEM, always make sure 285 + * mem->max_nr_ranges == mem->nr_ranges + 1 286 + * before calling the function each time. 287 + * 288 + * returns 0 if a memory range is excluded successfully 289 + * return -ENOMEM if mem->ranges doesn't have space to hold split ranges 290 + */ 268 291 int crash_exclude_mem_range(struct crash_mem *mem, 269 292 unsigned long long mstart, unsigned long long mend) 270 293 { ··· 338 333 339 334 return 0; 340 335 } 336 + EXPORT_SYMBOL_GPL(crash_exclude_mem_range); 341 337 342 338 ssize_t crash_get_memory_size(void) 343 339 {
+343
kernel/crash_core_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <kunit/test.h> 3 + #include <linux/crash_core.h> // For struct crash_mem and struct range if defined there 4 + 5 + // Helper to create and initialize crash_mem 6 + static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges, 7 + unsigned int nr_initial_ranges, 8 + const struct range *initial_ranges) 9 + { 10 + struct crash_mem *mem; 11 + size_t alloc_size; 12 + 13 + // Check if max_ranges can even hold initial_ranges 14 + if (max_ranges < nr_initial_ranges) { 15 + kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n", 16 + max_ranges, nr_initial_ranges); 17 + return NULL; 18 + } 19 + 20 + alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range); 21 + mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL); 22 + if (!mem) { 23 + kunit_err(test, "Failed to allocate crash_mem\n"); 24 + return NULL; 25 + } 26 + 27 + mem->max_nr_ranges = max_ranges; 28 + mem->nr_ranges = nr_initial_ranges; 29 + if (initial_ranges && nr_initial_ranges > 0) { 30 + memcpy(mem->ranges, initial_ranges, 31 + nr_initial_ranges * sizeof(struct range)); 32 + } 33 + 34 + return mem; 35 + } 36 + 37 + // Helper to compare ranges for assertions 38 + static void assert_ranges_equal(struct kunit *test, 39 + const struct range *actual_ranges, 40 + unsigned int actual_nr_ranges, 41 + const struct range *expected_ranges, 42 + unsigned int expected_nr_ranges, 43 + const char *case_name) 44 + { 45 + unsigned int i; 46 + 47 + KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges, 48 + "%s: Number of ranges mismatch.", case_name); 49 + 50 + for (i = 0; i < expected_nr_ranges; i++) { 51 + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start, 52 + "%s: Range %u start mismatch.", case_name, i); 53 + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end, 54 + "%s: Range %u end mismatch.", case_name, i); 55 + } 56 + } 57 + 58 + // Structure for test parameters
59 + struct exclude_test_param { 60 + const char *description; 61 + unsigned long long exclude_start; 62 + unsigned long long exclude_end; 63 + unsigned int initial_max_ranges; 64 + const struct range *initial_ranges; 65 + unsigned int initial_nr_ranges; 66 + const struct range *expected_ranges; 67 + unsigned int expected_nr_ranges; 68 + int expected_ret; 69 + }; 70 + 71 + static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params) 72 + { 73 + struct crash_mem *mem; 74 + int ret; 75 + 76 + kunit_info(test, "%s", params->description); 77 + 78 + mem = create_crash_mem(test, params->initial_max_ranges, 79 + params->initial_nr_ranges, params->initial_ranges); 80 + if (!mem) 81 + return; // Error already logged by create_crash_mem or kunit_kzalloc 82 + 83 + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); 84 + 85 + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, 86 + "%s: Return value mismatch.", params->description); 87 + 88 + if (params->expected_ret == 0) { 89 + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, 90 + params->expected_ranges, params->expected_nr_ranges, 91 + params->description); 92 + } else { 93 + // If an error is expected, nr_ranges might still be relevant to check 94 + // depending on the exact point of failure. For ENOMEM on split, 95 + // nr_ranges shouldn't have changed. 96 + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, 97 + mem->nr_ranges, 98 + "%s: Number of ranges mismatch on error.", 99 + params->description); 100 + } 101 + } 102 + 103 + /* 104 + * Test Strategy 1: One to-be-excluded range A and one existing range B. 105 + * 106 + * Exhaust all possibilities of the position of A regarding B. 107 + */
108 + 109 + static const struct range single_range_b = { .start = 100, .end = 199 }; 110 + 111 + static const struct exclude_test_param exclude_single_range_test_data[] = { 112 + { 113 + .description = "1.1: A is left of B, no overlap", 114 + .exclude_start = 10, .exclude_end = 50, 115 + .initial_max_ranges = 1, 116 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 117 + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, 118 + .expected_ret = 0, 119 + }, 120 + { 121 + .description = "1.2: A's right boundary touches B's left boundary", 122 + .exclude_start = 10, .exclude_end = 99, 123 + .initial_max_ranges = 1, 124 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 125 + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, 126 + .expected_ret = 0, 127 + }, 128 + { 129 + .description = "1.3: A overlaps B's left part", 130 + .exclude_start = 50, .exclude_end = 149, 131 + .initial_max_ranges = 1, 132 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 133 + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, 134 + .expected_nr_ranges = 1, 135 + .expected_ret = 0, 136 + }, 137 + { 138 + .description = "1.4: A is completely inside B", 139 + .exclude_start = 120, .exclude_end = 179, 140 + .initial_max_ranges = 2, // Needs space for split 141 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 142 + .expected_ranges = (const struct range[]){ 143 + { .start = 100, .end = 119 }, 144 + { .start = 180, .end = 199 } 145 + }, 146 + .expected_nr_ranges = 2, 147 + .expected_ret = 0, 148 + }, 149 + { 150 + .description = "1.5: A overlaps B's right part", 151 + .exclude_start = 150, .exclude_end = 249, 152 + .initial_max_ranges = 1, 153 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 154 + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, 155 + .expected_nr_ranges = 1, 156 + .expected_ret = 0, 157 + }, 158 + { 159 + .description = "1.6: A's left boundary touches B's right boundary",
160 + .exclude_start = 200, .exclude_end = 250, 161 + .initial_max_ranges = 1, 162 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 163 + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, 164 + .expected_ret = 0, 165 + }, 166 + { 167 + .description = "1.7: A is right of B, no overlap", 168 + .exclude_start = 250, .exclude_end = 300, 169 + .initial_max_ranges = 1, 170 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 171 + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, 172 + .expected_ret = 0, 173 + }, 174 + { 175 + .description = "1.8: A completely covers B and extends beyond", 176 + .exclude_start = 50, .exclude_end = 250, 177 + .initial_max_ranges = 1, 178 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 179 + .expected_ranges = NULL, .expected_nr_ranges = 0, 180 + .expected_ret = 0, 181 + }, 182 + { 183 + .description = "1.9: A covers B and extends to the left", 184 + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends 185 + .initial_max_ranges = 1, 186 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 187 + .expected_ranges = NULL, .expected_nr_ranges = 0, 188 + .expected_ret = 0, 189 + }, 190 + { 191 + .description = "1.10: A covers B and extends to the right", 192 + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts 193 + .initial_max_ranges = 1, 194 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 195 + .expected_ranges = NULL, .expected_nr_ranges = 0, 196 + .expected_ret = 0, 197 + }, 198 + { 199 + .description = "1.11: A is identical to B", 200 + .exclude_start = 100, .exclude_end = 199, 201 + .initial_max_ranges = 1, 202 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 203 + .expected_ranges = NULL, .expected_nr_ranges = 0, 204 + .expected_ret = 0, 205 + }, 206 + { 207 + .description = "1.12: A is a point, left of B, no overlap", 208 + .exclude_start = 10, .exclude_end = 10, 209 + .initial_max_ranges = 1,
210 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 211 + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, 212 + .expected_ret = 0, 213 + }, 214 + { 215 + .description = "1.13: A is a point, at start of B", 216 + .exclude_start = 100, .exclude_end = 100, 217 + .initial_max_ranges = 1, 218 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 219 + .expected_ranges = (const struct range[]){{ .start = 101, .end = 199 }}, 220 + .expected_nr_ranges = 1, 221 + .expected_ret = 0, 222 + }, 223 + { 224 + .description = "1.14: A is a point, in middle of B (causes split)", 225 + .exclude_start = 150, .exclude_end = 150, 226 + .initial_max_ranges = 2, // Needs space for split 227 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 228 + .expected_ranges = (const struct range[]){ 229 + { .start = 100, .end = 149 }, 230 + { .start = 151, .end = 199 } 231 + }, 232 + .expected_nr_ranges = 2, 233 + .expected_ret = 0, 234 + }, 235 + { 236 + .description = "1.15: A is a point, at end of B", 237 + .exclude_start = 199, .exclude_end = 199, 238 + .initial_max_ranges = 1, 239 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 240 + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, 241 + .expected_nr_ranges = 1, 242 + .expected_ret = 0, 243 + }, 244 + { 245 + .description = "1.16: A is a point, right of B, no overlap", 246 + .exclude_start = 250, .exclude_end = 250, 247 + .initial_max_ranges = 1, 248 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 249 + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, 250 + .expected_ret = 0, 251 + }, 252 + // ENOMEM case for single range split 253 + { 254 + .description = "1.17: A completely inside B (split), no space (ENOMEM)", 255 + .exclude_start = 120, .exclude_end = 179, 256 + .initial_max_ranges = 1, // Not enough for split 257 + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, 258 + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content
259 + .expected_nr_ranges = 1, // Should remain unchanged 260 + .expected_ret = -ENOMEM, 261 + }, 262 + }; 263 + 264 + 265 + static void exclude_single_range_test(struct kunit *test) 266 + { 267 + size_t i; 268 + 269 + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { 270 + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); 271 + run_exclude_test_case(test, &exclude_single_range_test_data[i]); 272 + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case 273 + } 274 + } 275 + 276 + /* 277 + * Test Strategy 2: Regression test. 278 + */ 279 + 280 + static const struct exclude_test_param exclude_range_regression_test_data[] = { 281 + // Test data from commit a2e9a95d2190 282 + { 283 + .description = "2.1: exclude low 1M", 284 + .exclude_start = 0, .exclude_end = (1 << 20) - 1, 285 + .initial_max_ranges = 3, 286 + .initial_ranges = (const struct range[]){ 287 + { .start = 0, .end = 0x3efff }, 288 + { .start = 0x3f000, .end = 0x3ffff }, 289 + { .start = 0x40000, .end = 0x9ffff } 290 + }, 291 + .initial_nr_ranges = 3, 292 + .expected_nr_ranges = 0, 293 + .expected_ret = 0, 294 + }, 295 + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u 296 + { 297 + .description = "2.2: when range out of bound", 298 + .exclude_start = 100, .exclude_end = 200, 299 + .initial_max_ranges = 3, 300 + .initial_ranges = (const struct range[]){ 301 + { .start = 1, .end = 299 }, 302 + { .start = 401, .end = 1000 }, 303 + { .start = 1001, .end = 2000 } 304 + }, 305 + .initial_nr_ranges = 3, 306 + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content 307 + .expected_nr_ranges = 3, // Should remain unchanged 308 + .expected_ret = -ENOMEM 309 + }, 310 + 311 + }; 312 + 313 + 314 + static void exclude_range_regression_test(struct kunit *test) 315 + { 316 + size_t i; 317 + 318 + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) {
319 + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); 320 + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); 321 + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case 322 + } 323 + } 324 + 325 + /* 326 + * KUnit Test Suite 327 + */ 328 + static struct kunit_case crash_exclude_mem_range_test_cases[] = { 329 + KUNIT_CASE(exclude_single_range_test), 330 + KUNIT_CASE(exclude_range_regression_test), 331 + {} 332 + }; 333 + 334 + static struct kunit_suite crash_exclude_mem_range_suite = { 335 + .name = "crash_exclude_mem_range_tests", 336 + .test_cases = crash_exclude_mem_range_test_cases, 337 + // .init and .exit can be NULL if not needed globally for the suite 338 + }; 339 + 340 + kunit_test_suite(crash_exclude_mem_range_suite); 341 + 342 + MODULE_DESCRIPTION("crash dump KUnit test suite"); 343 + MODULE_LICENSE("GPL");
+6 -12
kernel/fork.c
··· 2132 2132 2133 2133 p->pagefault_disabled = 0; 2134 2134 2135 - #ifdef CONFIG_LOCKDEP 2136 2135 lockdep_init_task(p); 2137 - #endif 2138 2136 2139 2137 p->blocked_on = NULL; /* not blocked yet */ 2140 2138 ··· 2545 2547 struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) 2546 2548 { 2547 2549 unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| 2548 - CLONE_IO; 2550 + CLONE_IO|CLONE_VM|CLONE_UNTRACED; 2549 2551 struct kernel_clone_args args = { 2550 - .flags = ((lower_32_bits(flags) | CLONE_VM | 2551 - CLONE_UNTRACED) & ~CSIGNAL), 2552 - .exit_signal = (lower_32_bits(flags) & CSIGNAL), 2552 + .flags = flags, 2553 2553 .fn = fn, 2554 2554 .fn_arg = arg, 2555 2555 .io_thread = 1, ··· 2659 2663 unsigned long flags) 2660 2664 { 2661 2665 struct kernel_clone_args args = { 2662 - .flags = ((lower_32_bits(flags) | CLONE_VM | 2663 - CLONE_UNTRACED) & ~CSIGNAL), 2664 - .exit_signal = (lower_32_bits(flags) & CSIGNAL), 2666 + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), 2667 + .exit_signal = (flags & CSIGNAL), 2665 2668 .fn = fn, 2666 2669 .fn_arg = arg, 2667 2670 .name = name, ··· 2676 2681 pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) 2677 2682 { 2678 2683 struct kernel_clone_args args = { 2679 - .flags = ((lower_32_bits(flags) | CLONE_VM | 2680 - CLONE_UNTRACED) & ~CSIGNAL), 2681 - .exit_signal = (lower_32_bits(flags) & CSIGNAL), 2684 + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), 2685 + .exit_signal = (flags & CSIGNAL), 2682 2686 .fn = fn, 2683 2687 .fn_arg = arg, 2684 2688 };
+41 -37
kernel/hung_task.c
··· 95 95 .notifier_call = hung_task_panic, 96 96 }; 97 97 98 + static bool task_is_hung(struct task_struct *t, unsigned long timeout) 99 + { 100 + unsigned long switch_count = t->nvcsw + t->nivcsw; 101 + unsigned int state = READ_ONCE(t->__state); 102 + 103 + /* 104 + * skip the TASK_KILLABLE tasks -- these can be killed 105 + * skip the TASK_IDLE tasks -- those are genuinely idle 106 + * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer 107 + */ 108 + if (!(state & TASK_UNINTERRUPTIBLE) || 109 + (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) 110 + return false; 111 + 112 + /* 113 + * When a freshly created task is scheduled once, changes its state to 114 + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it 115 + * musn't be checked. 116 + */ 117 + if (unlikely(!switch_count)) 118 + return false; 119 + 120 + if (switch_count != t->last_switch_count) { 121 + t->last_switch_count = switch_count; 122 + t->last_switch_time = jiffies; 123 + return false; 124 + } 125 + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) 126 + return false; 127 + 128 + return true; 129 + } 98 130 99 131 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER 100 - static void debug_show_blocker(struct task_struct *task) 132 + static void debug_show_blocker(struct task_struct *task, unsigned long timeout) 101 133 { 102 134 struct task_struct *g, *t; 103 135 unsigned long owner, blocker, blocker_type; ··· 206 174 t->pid, rwsem_blocked_by); 207 175 break; 208 176 } 209 - sched_show_task(t); 177 + /* Avoid duplicated task dump, skip if the task is also hung. */
178 + if (!task_is_hung(t, timeout)) 179 + sched_show_task(t); 210 180 return; 211 181 } 212 182 } 213 183 #else 214 - static inline void debug_show_blocker(struct task_struct *task) 184 + static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) 215 185 { 216 186 } 217 187 #endif 218 188 219 189 static void check_hung_task(struct task_struct *t, unsigned long timeout) 220 190 { 221 - unsigned long switch_count = t->nvcsw + t->nivcsw; 222 - 223 - /* 224 - * Ensure the task is not frozen. 225 - * Also, skip vfork and any other user process that freezer should skip. 226 - */ 227 - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) 228 - return; 229 - 230 - /* 231 - * When a freshly created task is scheduled once, changes its state to 232 - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it 233 - * musn't be checked. 234 - */ 235 - if (unlikely(!switch_count)) 236 - return; 237 - 238 - if (switch_count != t->last_switch_count) { 239 - t->last_switch_count = switch_count; 240 - t->last_switch_time = jiffies; 241 - return; 242 - } 243 - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) 191 + if (!task_is_hung(t, timeout)) 244 192 return; 245 193 246 194 /* ··· 255 243 pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 256 244 " disables this message.\n"); 257 245 sched_show_task(t); 258 - debug_show_blocker(t); 246 + debug_show_blocker(t, timeout); 259 247 hung_task_show_lock = true; 260 248 261 249 if (sysctl_hung_task_all_cpu_backtrace) ··· 311 299 hung_task_show_lock = false; 312 300 rcu_read_lock(); 313 301 for_each_process_thread(g, t) { 314 - unsigned int state; 315 302 316 303 if (!max_count--) 317 304 goto unlock; ··· 319 308 goto unlock; 320 309 last_break = jiffies; 321 310 } 322 - /* 323 - * skip the TASK_KILLABLE tasks -- these can be killed 324 - * skip the TASK_IDLE tasks -- those are genuinely idle 325 - */ 326 - state = READ_ONCE(t->__state); 327 - if ((state & TASK_UNINTERRUPTIBLE) &&
328 - !(state & TASK_WAKEKILL) && 329 - !(state & TASK_NOLOAD)) 330 - check_hung_task(t, timeout); 311 + 312 + check_hung_task(t, timeout); 331 313 } 332 314 unlock: 333 315 rcu_read_unlock();
+1 -1
kernel/kallsyms_selftest.c
··· 264 264 char namebuf[KSYM_NAME_LEN]; 265 265 struct test_stat *stat, *stat2; 266 266 267 - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); 267 + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); 268 268 if (!stat) 269 269 return -ENOMEM; 270 270 stat2 = stat + 1;
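The kallsyms_selftest hunk above replaces an open-coded `sizeof(*stat) * 2` with `kmalloc_array()`, which fails the allocation instead of silently wrapping when the count-times-size product would overflow. A userspace sketch of that overflow-checked multiply (a hypothetical `alloc_array()` helper, not the kernel API):

```c
#include <stdint.h>
#include <stdlib.h>

/* Overflow-checked array allocation, mirroring the check kmalloc_array()
 * performs before allocating: refuse n * size when it would wrap. */
static void *alloc_array(size_t n, size_t size)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;	/* would overflow: fail, don't wrap */
	return malloc(n * size);
}
```

For a fixed count of 2 the two forms allocate the same bytes; the array form simply makes the intent and the safety property explicit.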
+9
kernel/kcov.c
··· 978 978 memcpy(dst_entries, src_entries, bytes_to_move); 979 979 entries_moved = bytes_to_move >> entry_size_log; 980 980 981 + /* 982 + * A write memory barrier is required here, to ensure 983 + * that the writes from the memcpy() are visible before 984 + * the count is updated. Without this, it is possible for 985 + * a user to observe a new count value but stale 986 + * coverage data. 987 + */ 988 + smp_wmb(); 989 + 981 990 switch (mode) { 982 991 case KCOV_MODE_TRACE_PC: 983 992 WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved);
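The kcov hunk orders the `memcpy()` of coverage entries before the `WRITE_ONCE()` of the count, so a reader that observes the new count cannot see stale data. The same publish pattern can be sketched in userspace with C11 release/acquire ordering (a rough analogue only; the kernel's `smp_wmb()` pairs with a matching read barrier on the consumer side):

```c
#include <stdatomic.h>

static int data;
static atomic_int count;

/* Writer: store the payload first, then release-store the count, so a
 * reader that sees the new count also sees the payload (kcov's smp_wmb). */
static void publish(int value)
{
	data = value;
	atomic_store_explicit(&count, 1, memory_order_release);
}

/* Reader: acquire-load the count first; only then is 'data' guaranteed
 * to be the published value. */
static int consume(void)
{
	if (atomic_load_explicit(&count, memory_order_acquire) == 1)
		return data;
	return -1;
}
```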
-1
kernel/kexec_core.c
··· 233 233 if (!image) 234 234 return NULL; 235 235 236 - image->head = 0; 237 236 image->entry = &image->head; 238 237 image->last_entry = &image->head; 239 238 image->control_page = ~0; /* By default this does not apply */
+1
kernel/kexec_file.c
··· 255 255 } 256 256 257 257 image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); 258 + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; 258 259 259 260 if (cmdline_len) { 260 261 image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
+21 -1
kernel/kexec_handover.c
··· 988 988 } 989 989 990 990 /** 991 + * is_kho_boot - check if current kernel was booted via KHO-enabled 992 + * kexec 993 + * 994 + * This function checks if the current kernel was loaded through a kexec 995 + * operation with KHO enabled, by verifying that a valid KHO FDT 996 + * was passed. 997 + * 998 + * Note: This function returns reliable results only after 999 + * kho_populate() has been called during early boot. Before that, 1000 + * it may return false even if KHO data is present. 1001 + * 1002 + * Return: true if booted via KHO-enabled kexec, false otherwise 1003 + */ 1004 + bool is_kho_boot(void) 1005 + { 1006 + return !!kho_get_fdt(); 1007 + } 1008 + EXPORT_SYMBOL_GPL(is_kho_boot); 1009 + 1010 + /** 991 1011 * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. 992 1012 * @name: the name of the sub FDT passed to kho_add_subtree(). 993 1013 * @phys: if found, the physical address of the sub FDT is stored in @phys. ··· 1289 1269 int err = 0; 1290 1270 struct kexec_buf scratch; 1291 1271 1292 - if (!kho_enable) 1272 + if (!kho_out.finalized) 1293 1273 return 0; 1294 1274 1295 1275 image->kho.fdt = page_to_phys(kho_out.ser.fdt);
+103 -26
kernel/panic.c
··· 53 53 #define sysctl_oops_all_cpu_backtrace 0 54 54 #endif /* CONFIG_SMP */ 55 55 56 - int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; 56 + int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); 57 57 static unsigned long tainted_mask = 58 58 IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; 59 59 static int pause_on_oops; ··· 67 67 static bool panic_console_replay; 68 68 69 69 bool panic_triggering_all_cpu_backtrace; 70 + static bool panic_this_cpu_backtrace_printed; 70 71 71 72 int panic_timeout = CONFIG_PANIC_TIMEOUT; 72 73 EXPORT_SYMBOL_GPL(panic_timeout); ··· 77 76 ATOMIC_NOTIFIER_HEAD(panic_notifier_list); 78 77 79 78 EXPORT_SYMBOL(panic_notifier_list); 79 + 80 + static void panic_print_deprecated(void) 81 + { 82 + pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); 83 + } 80 84 81 85 #ifdef CONFIG_SYSCTL 82 86 ··· 131 125 static int sysctl_panic_print_handler(const struct ctl_table *table, int write, 132 126 void *buffer, size_t *lenp, loff_t *ppos) 133 127 { 134 - pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); 128 + panic_print_deprecated(); 135 129 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 136 130 } 137 131 ··· 300 294 301 295 atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); 302 296 297 + bool panic_try_start(void) 298 + { 299 + int old_cpu, this_cpu; 300 + 301 + /* 302 + * Only one CPU is allowed to execute the crash_kexec() code as with 303 + * panic(). Otherwise parallel calls of panic() and crash_kexec() 304 + * may stop each other. To exclude them, we use panic_cpu here too. 
305 + */ 306 + old_cpu = PANIC_CPU_INVALID; 307 + this_cpu = raw_smp_processor_id(); 308 + 309 + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); 310 + } 311 + EXPORT_SYMBOL(panic_try_start); 312 + 313 + void panic_reset(void) 314 + { 315 + atomic_set(&panic_cpu, PANIC_CPU_INVALID); 316 + } 317 + EXPORT_SYMBOL(panic_reset); 318 + 319 + bool panic_in_progress(void) 320 + { 321 + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); 322 + } 323 + EXPORT_SYMBOL(panic_in_progress); 324 + 325 + /* Return true if a panic is in progress on the current CPU. */ 326 + bool panic_on_this_cpu(void) 327 + { 328 + /* 329 + * We can use raw_smp_processor_id() here because it is impossible for 330 + * the task to be migrated to the panic_cpu, or away from it. If 331 + * panic_cpu has already been set, and we're not currently executing on 332 + * that CPU, then we never will be. 333 + */ 334 + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); 335 + } 336 + EXPORT_SYMBOL(panic_on_this_cpu); 337 + 338 + /* 339 + * Return true if a panic is in progress on a remote CPU. 340 + * 341 + * On true, the local CPU should immediately release any printing resources 342 + * that may be needed by the panic CPU. 343 + */ 344 + bool panic_on_other_cpu(void) 345 + { 346 + return (panic_in_progress() && !panic_on_this_cpu()); 347 + } 348 + EXPORT_SYMBOL(panic_on_other_cpu); 349 + 303 350 /* 304 351 * A variant of panic() called from NMI context. We return if we've already 305 352 * panicked on this CPU. 
If another CPU already panicked, loop in ··· 361 302 */ 362 303 void nmi_panic(struct pt_regs *regs, const char *msg) 363 304 { 364 - int old_cpu, this_cpu; 365 - 366 - old_cpu = PANIC_CPU_INVALID; 367 - this_cpu = raw_smp_processor_id(); 368 - 369 - /* atomic_try_cmpxchg updates old_cpu on failure */ 370 - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) 305 + if (panic_try_start()) 371 306 panic("%s", msg); 372 - else if (old_cpu != this_cpu) 307 + else if (panic_on_other_cpu()) 373 308 nmi_panic_self_stop(regs); 374 309 } 375 310 EXPORT_SYMBOL(nmi_panic); ··· 381 328 origin, limit); 382 329 } 383 330 331 + static void panic_trigger_all_cpu_backtrace(void) 332 + { 333 + /* Temporary allow non-panic CPUs to write their backtraces. */ 334 + panic_triggering_all_cpu_backtrace = true; 335 + 336 + if (panic_this_cpu_backtrace_printed) 337 + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); 338 + else 339 + trigger_all_cpu_backtrace(); 340 + 341 + panic_triggering_all_cpu_backtrace = false; 342 + } 343 + 384 344 /* 385 345 * Helper that triggers the NMI backtrace (if set in panic_print) 386 346 * and then performs the secondary CPUs shutdown - we cannot have ··· 401 335 */ 402 336 static void panic_other_cpus_shutdown(bool crash_kexec) 403 337 { 404 - if (panic_print & SYS_INFO_ALL_CPU_BT) { 405 - /* Temporary allow non-panic CPUs to write their backtraces. 
*/ 406 - panic_triggering_all_cpu_backtrace = true; 407 - trigger_all_cpu_backtrace(); 408 - panic_triggering_all_cpu_backtrace = false; 409 - } 338 + if (panic_print & SYS_INFO_ALL_CPU_BT) 339 + panic_trigger_all_cpu_backtrace(); 410 340 411 341 /* 412 342 * Note that smp_send_stop() is the usual SMP shutdown function, ··· 430 368 static char buf[1024]; 431 369 long i, i_next = 0, len; 432 370 int state = 0; 433 - int old_cpu, this_cpu; 434 371 bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; 435 372 436 373 if (panic_on_warn) { ··· 466 405 * `old_cpu == this_cpu' means we came from nmi_panic() which sets 467 406 * panic_cpu to this CPU. In this case, this is also the 1st CPU. 468 407 */ 469 - old_cpu = PANIC_CPU_INVALID; 470 - this_cpu = raw_smp_processor_id(); 471 - 472 408 /* atomic_try_cmpxchg updates old_cpu on failure */ 473 - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { 409 + if (panic_try_start()) { 474 410 /* go ahead */ 475 - } else if (old_cpu != this_cpu) 411 + } else if (panic_on_other_cpu()) 476 412 panic_smp_self_stop(); 477 413 478 414 console_verbose(); ··· 480 422 buf[len - 1] = '\0'; 481 423 482 424 pr_emerg("Kernel panic - not syncing: %s\n", buf); 483 - #ifdef CONFIG_DEBUG_BUGVERBOSE 484 425 /* 485 426 * Avoid nested stack-dumping if a panic occurs during oops processing 486 427 */ 487 - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) 428 + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { 429 + panic_this_cpu_backtrace_printed = true; 430 + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { 488 431 dump_stack(); 489 - #endif 432 + panic_this_cpu_backtrace_printed = true; 433 + } 490 434 491 435 /* 492 436 * If kgdb is enabled, give it a chance to run before we stop all ··· 997 937 #endif 998 938 999 939 core_param(panic, panic_timeout, int, 0644); 1000 - core_param(panic_print, panic_print, ulong, 0644); 1001 940 core_param(pause_on_oops, pause_on_oops, int, 0644); 1002 941 core_param(panic_on_warn, 
panic_on_warn, int, 0644); 1003 942 core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); 1004 943 core_param(panic_console_replay, panic_console_replay, bool, 0644); 944 + 945 + static int panic_print_set(const char *val, const struct kernel_param *kp) 946 + { 947 + panic_print_deprecated(); 948 + return param_set_ulong(val, kp); 949 + } 950 + 951 + static int panic_print_get(char *val, const struct kernel_param *kp) 952 + { 953 + panic_print_deprecated(); 954 + return param_get_ulong(val, kp); 955 + } 956 + 957 + static const struct kernel_param_ops panic_print_ops = { 958 + .set = panic_print_set, 959 + .get = panic_print_get, 960 + }; 961 + __core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); 1005 962 1006 963 static int __init oops_setup(char *s) 1007 964 {
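The new `panic_try_start()` above elects exactly one panic CPU with a single `atomic_try_cmpxchg()` on `panic_cpu`; every later caller loses the race, and `panic_on_this_cpu()` tells it whether the winner was itself. A userspace sketch of the same try-to-claim idiom using C11 atomics (the names mirror the patch but this is not the kernel API):

```c
#include <stdatomic.h>
#include <stdbool.h>

#define PANIC_CPU_INVALID (-1)

static atomic_int panic_cpu = PANIC_CPU_INVALID;

/* The first caller claims panic_cpu and returns true; all later callers
 * (on any CPU) fail the compare-exchange and return false. */
static bool panic_try_start(int this_cpu)
{
	int old = PANIC_CPU_INVALID;

	return atomic_compare_exchange_strong(&panic_cpu, &old, this_cpu);
}

static bool panic_on_this_cpu(int this_cpu)
{
	return atomic_load(&panic_cpu) == this_cpu;
}
```

Centralizing this in `panic_try_start()`/`panic_on_other_cpu()` is what lets `nmi_panic()`, `panic()`, the printk code, and the watchdogs all share one arbitration point instead of open-coding the cmpxchg.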
-1
kernel/printk/internal.h
··· 332 332 unsigned long dropped; 333 333 }; 334 334 335 - bool other_cpu_in_panic(void); 336 335 bool printk_get_next_message(struct printk_message *pmsg, u64 seq, 337 336 bool is_extended, bool may_supress); 338 337
+7 -7
kernel/printk/nbcon.c
··· 12 12 #include <linux/irqflags.h> 13 13 #include <linux/kthread.h> 14 14 #include <linux/minmax.h> 15 + #include <linux/panic.h> 15 16 #include <linux/percpu.h> 16 17 #include <linux/preempt.h> 17 18 #include <linux/slab.h> ··· 255 254 * opportunity to perform any necessary cleanup if they were 256 255 * interrupted by the panic CPU while printing. 257 256 */ 258 - if (other_cpu_in_panic() && 257 + if (panic_on_other_cpu() && 259 258 (!is_reacquire || cur->unsafe_takeover)) { 260 259 return -EPERM; 261 260 } ··· 310 309 * Event #2 implies the new context is PANIC. 311 310 * Event #3 occurs when panic() has flushed the console. 312 311 * Event #4 occurs when a non-panic CPU reacquires. 313 - * Event #5 is not possible due to the other_cpu_in_panic() check 312 + * Event #5 is not possible due to the panic_on_other_cpu() check 314 313 * in nbcon_context_try_acquire_handover(). 315 314 */ 316 315 ··· 349 348 struct nbcon_state new; 350 349 351 350 /* Note that the caller must still remove the request! */ 352 - if (other_cpu_in_panic()) 351 + if (panic_on_other_cpu()) 353 352 return -EPERM; 354 353 355 354 /* ··· 447 446 * nbcon_waiter_matches(). In particular, the assumption that 448 447 * lower priorities are ignored during panic. 449 448 */ 450 - if (other_cpu_in_panic()) 449 + if (panic_on_other_cpu()) 451 450 return -EPERM; 452 451 453 452 /* Handover is not possible on the same CPU. */ ··· 590 589 */ 591 590 static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) 592 591 { 593 - unsigned int cpu = smp_processor_id(); 594 592 struct console *con = ctxt->console; 595 593 struct nbcon_state cur; 596 594 int err; ··· 614 614 /* Acquire succeeded. */ 615 615 616 616 /* Assign the appropriate buffer for this context. 
*/ 617 - if (atomic_read(&panic_cpu) == cpu) 617 + if (panic_on_this_cpu()) 618 618 ctxt->pbufs = &panic_nbcon_pbufs; 619 619 else 620 620 ctxt->pbufs = con->pbufs; ··· 1394 1394 { 1395 1395 unsigned int *cpu_emergency_nesting; 1396 1396 1397 - if (this_cpu_in_panic()) 1397 + if (panic_on_this_cpu()) 1398 1398 return NBCON_PRIO_PANIC; 1399 1399 1400 1400 cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
+5 -32
kernel/printk/printk.c
··· 48 48 #include <linux/sched/clock.h> 49 49 #include <linux/sched/debug.h> 50 50 #include <linux/sched/task_stack.h> 51 + #include <linux/panic.h> 51 52 52 53 #include <linux/uaccess.h> 53 54 #include <asm/sections.h> ··· 345 344 printk_safe_exit_irqrestore(flags); 346 345 } 347 346 #define up_console_sem() __up_console_sem(_RET_IP_) 348 - 349 - static bool panic_in_progress(void) 350 - { 351 - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); 352 - } 353 - 354 - /* Return true if a panic is in progress on the current CPU. */ 355 - bool this_cpu_in_panic(void) 356 - { 357 - /* 358 - * We can use raw_smp_processor_id() here because it is impossible for 359 - * the task to be migrated to the panic_cpu, or away from it. If 360 - * panic_cpu has already been set, and we're not currently executing on 361 - * that CPU, then we never will be. 362 - */ 363 - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); 364 - } 365 - 366 - /* 367 - * Return true if a panic is in progress on a remote CPU. 368 - * 369 - * On true, the local CPU should immediately release any printing resources 370 - * that may be needed by the panic CPU. 371 - */ 372 - bool other_cpu_in_panic(void) 373 - { 374 - return (panic_in_progress() && !this_cpu_in_panic()); 375 - } 376 347 377 348 /* 378 349 * This is used for debugging the mess that is the VT code by ··· 2380 2407 * non-panic CPUs are generating any messages, they will be 2381 2408 * silently dropped. 2382 2409 */ 2383 - if (other_cpu_in_panic() && 2410 + if (panic_on_other_cpu() && 2384 2411 !debug_non_panic_cpus && 2385 2412 !panic_triggering_all_cpu_backtrace) 2386 2413 return 0; ··· 2816 2843 might_sleep(); 2817 2844 2818 2845 /* On panic, the console_lock must be left to the panic cpu. 
*/ 2819 - while (other_cpu_in_panic()) 2846 + while (panic_on_other_cpu()) 2820 2847 msleep(1000); 2821 2848 2822 2849 down_console_sem(); ··· 2836 2863 int console_trylock(void) 2837 2864 { 2838 2865 /* On panic, the console_lock must be left to the panic cpu. */ 2839 - if (other_cpu_in_panic()) 2866 + if (panic_on_other_cpu()) 2840 2867 return 0; 2841 2868 if (down_trylock_console_sem()) 2842 2869 return 0; ··· 3216 3243 any_progress = true; 3217 3244 3218 3245 /* Allow panic_cpu to take over the consoles safely. */ 3219 - if (other_cpu_in_panic()) 3246 + if (panic_on_other_cpu()) 3220 3247 goto abandon; 3221 3248 3222 3249 if (do_cond_resched)
+1 -1
kernel/printk/printk_ringbuffer.c
··· 2143 2143 * But it would have the sequence number returned 2144 2144 * by "prb_next_reserve_seq() - 1". 2145 2145 */ 2146 - if (this_cpu_in_panic() && 2146 + if (panic_on_this_cpu() && 2147 2147 (!debug_non_panic_cpus || legacy_allow_panic_sync) && 2148 2148 ((*seq + 1) < prb_next_reserve_seq(rb))) { 2149 2149 (*seq)++;
+30 -2
kernel/sys.c
··· 1734 1734 struct rlimit old, new; 1735 1735 struct task_struct *tsk; 1736 1736 unsigned int checkflags = 0; 1737 + bool need_tasklist; 1737 1738 int ret; 1738 1739 1739 1740 if (old_rlim) ··· 1761 1760 get_task_struct(tsk); 1762 1761 rcu_read_unlock(); 1763 1762 1764 - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, 1765 - old_rlim ? &old : NULL); 1763 + need_tasklist = !same_thread_group(tsk, current); 1764 + if (need_tasklist) { 1765 + /* 1766 + * Ensure we can't race with group exit or de_thread(), 1767 + * so tsk->group_leader can't be freed or changed until 1768 + * read_unlock(tasklist_lock) below. 1769 + */ 1770 + read_lock(&tasklist_lock); 1771 + if (!pid_alive(tsk)) 1772 + ret = -ESRCH; 1773 + } 1774 + 1775 + if (!ret) { 1776 + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, 1777 + old_rlim ? &old : NULL); 1778 + } 1779 + 1780 + if (need_tasklist) 1781 + read_unlock(&tasklist_lock); 1766 1782 1767 1783 if (!ret && old_rlim) { 1768 1784 rlim_to_rlim64(&old, &old64); ··· 2533 2515 error = -EINVAL; 2534 2516 break; 2535 2517 } 2518 + /* 2519 + * Ensure that either: 2520 + * 2521 + * 1. Subsequent getppid() calls reflect the parent process having died. 2522 + * 2. forget_original_parent() will send the new me->pdeath_signal. 2523 + * 2524 + * Also prevent the read of me->pdeath_signal from being a data race. 2525 + */ 2526 + read_lock(&tasklist_lock); 2536 2527 me->pdeath_signal = arg2; 2528 + read_unlock(&tasklist_lock); 2537 2529 break; 2538 2530 case PR_GET_PDEATHSIG: 2539 2531 error = put_user(me->pdeath_signal, (int __user *)arg2);
+23 -5
kernel/watchdog.c
··· 425 425 */ 426 426 static u16 get_16bit_precision(u64 data_ns) 427 427 { 428 - return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ 428 + /* 429 + * 2^24ns ~= 16.8ms 430 + * Round to the nearest multiple of 16.8 milliseconds. 431 + */ 432 + return (data_ns + (1 << 23)) >> 24LL; 429 433 } 430 434 431 435 static void update_cpustat(void) ··· 448 444 old_stat = __this_cpu_read(cpustat_old[i]); 449 445 new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); 450 446 util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); 447 + /* 448 + * Since we use 16-bit precision, the raw data will undergo 449 + * integer division, which may sometimes result in data loss, 450 + * and then result might exceed 100%. To avoid confusion, 451 + * we enforce a 100% display cap when calculations exceed this threshold. 452 + */ 453 + if (util > 100) 454 + util = 100; 451 455 __this_cpu_write(cpustat_util[tail][i], util); 452 456 __this_cpu_write(cpustat_old[i], new_stat); 453 457 } ··· 467 455 { 468 456 int i, group; 469 457 u8 tail = __this_cpu_read(cpustat_tail); 470 - u64 sample_period_second = sample_period; 458 + u64 sample_period_msecond = sample_period; 471 459 472 - do_div(sample_period_second, NSEC_PER_SEC); 460 + do_div(sample_period_msecond, NSEC_PER_MSEC); 473 461 474 462 /* 475 463 * Outputting the "watchdog" prefix on every line is redundant and not 476 464 * concise, and the original alarm information is sufficient for 477 465 * positioning in logs, hence here printk() is used instead of pr_crit(). 
478 466 */ 479 - printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", 480 - smp_processor_id(), sample_period_second); 467 + printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", 468 + smp_processor_id(), sample_period_msecond); 481 469 482 470 for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { 483 471 group = (tail + i) % NUM_SAMPLE_PERIODS; ··· 750 738 unsigned long flags; 751 739 752 740 if (!watchdog_enabled) 741 + return HRTIMER_NORESTART; 742 + 743 + /* 744 + * pass the buddy check if a panic is in process 745 + */ 746 + if (panic_in_progress()) 753 747 return HRTIMER_NORESTART; 754 748 755 749 watchdog_hardlockup_kick();
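The watchdog hunk changes `get_16bit_precision()` from truncating (`>> 24`) to round-to-nearest by adding half the divisor (`1 << 23`) before shifting, the usual add-half-then-shift trick for power-of-two divisors. A minimal sketch of the before/after behavior:

```c
#include <stdint.h>

/* Old behavior: plain truncation, losing up to ~16.8ms per sample. */
static uint16_t prec16_trunc(uint64_t ns)
{
	return (uint16_t)(ns >> 24);
}

/* New behavior: round to the nearest multiple of 2^24 ns (~16.8ms) by
 * adding half the divisor (2^23) before shifting. */
static uint16_t prec16_round(uint64_t ns)
{
	return (uint16_t)((ns + (1u << 23)) >> 24);
}
```

For an input just under 2^24 ns, truncation reports 0 units while rounding reports 1, which is why the utilization figures were previously biased low.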
+4
kernel/watchdog_perf.c
··· 12 12 13 13 #define pr_fmt(fmt) "NMI watchdog: " fmt 14 14 15 + #include <linux/panic.h> 15 16 #include <linux/nmi.h> 16 17 #include <linux/atomic.h> 17 18 #include <linux/module.h> ··· 108 107 { 109 108 /* Ensure the watchdog never gets throttled */ 110 109 event->hw.interrupts = 0; 110 + 111 + if (panic_in_progress()) 112 + return; 111 113 112 114 if (!watchdog_check_timestamp()) 113 115 return;
-6
lib/Kconfig.debug
··· 1067 1067 1068 1068 Say N if unsure. 1069 1069 1070 - config PANIC_ON_OOPS_VALUE 1071 - int 1072 - range 0 1 1073 - default 0 if !PANIC_ON_OOPS 1074 - default 1 if PANIC_ON_OOPS 1075 - 1076 1070 config PANIC_TIMEOUT 1077 1071 int "panic timeout" 1078 1072 default 0
+2 -1
lib/alloc_tag.c
··· 9 9 #include <linux/proc_fs.h> 10 10 #include <linux/seq_buf.h> 11 11 #include <linux/seq_file.h> 12 + #include <linux/string_choices.h> 12 13 #include <linux/vmalloc.h> 13 14 #include <linux/kmemleak.h> 14 15 ··· 729 728 } 730 729 mem_profiling_support = true; 731 730 pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n", 732 - compressed ? "with" : "without", enable ? "on" : "off"); 731 + compressed ? "with" : "without", str_on_off(enable)); 733 732 } 734 733 735 734 if (enable != mem_alloc_profiling_enabled()) {
+2 -2
lib/btree.c
··· 653 653 * walks to remove a single object from the victim. 654 654 */ 655 655 for (;;) { 656 - if (!btree_last(victim, geo, key)) 656 + val = btree_last(victim, geo, key); 657 + if (!val) 657 658 break; 658 - val = btree_lookup(victim, geo, key); 659 659 err = btree_insert(target, geo, key, val, gfp); 660 660 if (err) 661 661 return err;
+10 -11
lib/decompress.c
··· 49 49 }; 50 50 51 51 static const struct compress_format compressed_formats[] __initconst = { 52 - { {0x1f, 0x8b}, "gzip", gunzip }, 53 - { {0x1f, 0x9e}, "gzip", gunzip }, 54 - { {0x42, 0x5a}, "bzip2", bunzip2 }, 55 - { {0x5d, 0x00}, "lzma", unlzma }, 56 - { {0xfd, 0x37}, "xz", unxz }, 57 - { {0x89, 0x4c}, "lzo", unlzo }, 58 - { {0x02, 0x21}, "lz4", unlz4 }, 59 - { {0x28, 0xb5}, "zstd", unzstd }, 60 - { {0, 0}, NULL, NULL } 52 + { .magic = {0x1f, 0x8b}, .name = "gzip", .decompressor = gunzip }, 53 + { .magic = {0x1f, 0x9e}, .name = "gzip", .decompressor = gunzip }, 54 + { .magic = {0x42, 0x5a}, .name = "bzip2", .decompressor = bunzip2 }, 55 + { .magic = {0x5d, 0x00}, .name = "lzma", .decompressor = unlzma }, 56 + { .magic = {0xfd, 0x37}, .name = "xz", .decompressor = unxz }, 57 + { .magic = {0x89, 0x4c}, .name = "lzo", .decompressor = unlzo }, 58 + { .magic = {0x02, 0x21}, .name = "lz4", .decompressor = unlz4 }, 59 + { .magic = {0x28, 0xb5}, .name = "zstd", .decompressor = unzstd }, 60 + { /* sentinel */ } 61 61 }; 62 62 63 63 decompress_fn __init decompress_method(const unsigned char *inbuf, long len, ··· 73 73 74 74 pr_debug("Compressed data magic: %#.2x %#.2x\n", inbuf[0], inbuf[1]); 75 75 76 - for (cf = compressed_formats; cf->name; cf++) { 76 + for (cf = compressed_formats; cf->name; cf++) 77 77 if (!memcmp(inbuf, cf->magic, 2)) 78 78 break; 79 79 80 - } 81 80 if (name) 82 81 *name = cf->name; 83 82 return cf->decompressor;
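The decompress table rewrite above switches to designated initializers and an all-zero sentinel entry (`{ /* sentinel */ }`), so the lookup loop can stop on `cf->name == NULL` without spelling every zero field out. A compact sketch of the pattern with a hypothetical two-entry table (not the kernel's full list):

```c
#include <stddef.h>
#include <string.h>

struct fmt {
	unsigned char magic[2];
	const char *name;
};

/* Designated initializers keep the field/value pairing obvious, and the
 * empty sentinel entry zero-fills .name so the scan below terminates. */
static const struct fmt formats[] = {
	{ .magic = {0x1f, 0x8b}, .name = "gzip"  },
	{ .magic = {0x42, 0x5a}, .name = "bzip2" },
	{ /* sentinel */ }
};

static const char *match_magic(const unsigned char *buf)
{
	const struct fmt *f;

	for (f = formats; f->name; f++)
		if (!memcmp(buf, f->magic, 2))
			return f->name;
	return NULL;
}
```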
-1
lib/digsig.c
··· 159 159 160 160 len = mlen; 161 161 head = len - l; 162 - memset(out1, 0, head); 163 162 memcpy(out1 + head, p, l); 164 163 165 164 kfree(p);
+1 -1
lib/dump_stack.c
··· 102 102 */ 103 103 asmlinkage __visible void dump_stack_lvl(const char *log_lvl) 104 104 { 105 - bool in_panic = this_cpu_in_panic(); 105 + bool in_panic = panic_on_this_cpu(); 106 106 unsigned long flags; 107 107 108 108 /*
+1 -3
lib/fault-inject-usercopy.c
··· 22 22 23 23 dir = fault_create_debugfs_attr("fail_usercopy", NULL, 24 24 &fail_usercopy.attr); 25 - if (IS_ERR(dir)) 26 - return PTR_ERR(dir); 27 25 28 - return 0; 26 + return PTR_ERR_OR_ZERO(dir); 29 27 } 30 28 31 29 late_initcall(fail_usercopy_debugfs);
+4 -1
lib/genalloc.c
··· 899 899 if (!name) 900 900 name = of_node_full_name(np_pool); 901 901 } 902 - if (pdev) 902 + if (pdev) { 903 903 pool = gen_pool_get(&pdev->dev, name); 904 + put_device(&pdev->dev); 905 + } 906 + 904 907 of_node_put(np_pool); 905 908 906 909 return pool;
+3 -3
lib/ref_tracker.c
··· 75 75 struct ref_tracker *tracker; 76 76 77 77 stats = kmalloc(struct_size(stats, stacks, limit), 78 - GFP_NOWAIT | __GFP_NOWARN); 78 + GFP_NOWAIT); 79 79 if (!stats) 80 80 return ERR_PTR(-ENOMEM); 81 81 stats->total = 0; ··· 159 159 return; 160 160 } 161 161 162 - sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN); 162 + sbuf = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT); 163 163 164 164 for (i = 0, skipped = stats->total; i < stats->count; ++i) { 165 165 stack = stats->stacks[i].stack_handle; ··· 306 306 } 307 307 nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1); 308 308 stack_handle = stack_depot_save(entries, nr_entries, 309 - GFP_NOWAIT | __GFP_NOWARN); 309 + GFP_NOWAIT); 310 310 311 311 spin_lock_irqsave(&dir->lock, flags); 312 312 if (tracker->dead) {
+2 -1
lib/sys_info.c
··· 55 55 void *buffer, size_t *lenp, 56 56 loff_t *ppos) 57 57 { 58 - char names[sizeof(sys_info_avail) + 1]; 58 + char names[sizeof(sys_info_avail)]; 59 59 struct ctl_table table; 60 60 unsigned long *si_bits_global; 61 61 ··· 81 81 char *delim = ""; 82 82 int i, len = 0; 83 83 84 + names[0] = '\0'; 84 85 for (i = 0; i < ARRAY_SIZE(si_names); i++) { 85 86 if (*si_bits_global & si_names[i].bit) { 86 87 len += scnprintf(names + len, sizeof(names) - len,
+4 -3
lib/test_firmware.c
··· 26 26 #include <linux/kthread.h> 27 27 #include <linux/vmalloc.h> 28 28 #include <linux/efi_embedded_fw.h> 29 + #include <linux/string_choices.h> 29 30 30 31 MODULE_IMPORT_NS("TEST_FIRMWARE"); 31 32 ··· 305 304 "FW_ACTION_NOUEVENT"); 306 305 len += scnprintf(buf + len, PAGE_SIZE - len, 307 306 "into_buf:\t\t%s\n", 308 - test_fw_config->into_buf ? "true" : "false"); 307 + str_true_false(test_fw_config->into_buf)); 309 308 len += scnprintf(buf + len, PAGE_SIZE - len, 310 309 "buf_size:\t%zu\n", test_fw_config->buf_size); 311 310 len += scnprintf(buf + len, PAGE_SIZE - len, 312 311 "file_offset:\t%zu\n", test_fw_config->file_offset); 313 312 len += scnprintf(buf + len, PAGE_SIZE - len, 314 313 "partial:\t\t%s\n", 315 - test_fw_config->partial ? "true" : "false"); 314 + str_true_false(test_fw_config->partial)); 316 315 len += scnprintf(buf + len, PAGE_SIZE - len, 317 316 "sync_direct:\t\t%s\n", 318 - test_fw_config->sync_direct ? "true" : "false"); 317 + str_true_false(test_fw_config->sync_direct)); 319 318 len += scnprintf(buf + len, PAGE_SIZE - len, 320 319 "read_fw_idx:\t%u\n", test_fw_config->read_fw_idx); 321 320 if (test_fw_config->upload_name)
+9 -5
scripts/checkpatch.pl
··· 2636 2636 $realfile =~ m@/bpf/.*\.bpf\.c$@; 2637 2637 } 2638 2638 2639 + sub is_userspace { 2640 + my ($realfile) = @_; 2641 + return ($realfile =~ m@^tools/@ || $realfile =~ m@^scripts/@); 2642 + } 2643 + 2639 2644 sub process { 2640 2645 my $filename = shift; 2641 2646 ··· 3299 3294 # file delta changes 3300 3295 $line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ || 3301 3296 # filename then : 3302 - $line =~ /^\s*(?:Fixes:|$link_tags_search|$signature_tags)/i || 3297 + $line =~ /^\s*(?:Fixes:|https?:|$link_tags_search|$signature_tags)/i || 3303 3298 # A Fixes:, link or signature tag line 3304 3299 $commit_log_possible_stack_dump)) { 3305 3300 WARN("COMMIT_LOG_LONG_LINE", ··· 7023 7018 # } 7024 7019 # } 7025 7020 # } 7026 - 7027 7021 # strcpy uses that should likely be strscpy 7028 - if ($line =~ /\bstrcpy\s*\(/) { 7022 + if ($line =~ /\bstrcpy\s*\(/ && !is_userspace($realfile)) { 7029 7023 WARN("STRCPY", 7030 7024 "Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88\n" . $herecurr); 7031 7025 } 7032 7026 7033 7027 # strlcpy uses that should likely be strscpy 7034 - if ($line =~ /\bstrlcpy\s*\(/) { 7028 + if ($line =~ /\bstrlcpy\s*\(/ && !is_userspace($realfile)) { 7035 7029 WARN("STRLCPY", 7036 7030 "Prefer strscpy over strlcpy - see: https://github.com/KSPP/linux/issues/89\n" . $herecurr); 7037 7031 } 7038 7032 7039 7033 # strncpy uses that should likely be strscpy or strscpy_pad 7040 - if ($line =~ /\bstrncpy\s*\(/) { 7034 + if ($line =~ /\bstrncpy\s*\(/ && !is_userspace($realfile)) { 7041 7035 WARN("STRNCPY", 7042 7036 "Prefer strscpy, strscpy_pad, or __nonstring over strncpy - see: https://github.com/KSPP/linux/issues/90\n" . $herecurr); 7043 7037 }
+9
scripts/coccinelle/api/platform_no_drv_owner.cocci
··· 10 10 virtual report 11 11 12 12 @match1@ 13 + declarer name builtin_i2c_driver; 14 + declarer name builtin_platform_driver; 15 + declarer name builtin_platform_driver_probe; 13 16 declarer name module_i2c_driver; 14 17 declarer name module_platform_driver; 15 18 declarer name module_platform_driver_probe; 16 19 identifier __driver; 17 20 @@ 18 21 ( 22 + builtin_i2c_driver(__driver); 23 + | 24 + builtin_platform_driver(__driver); 25 + | 26 + builtin_platform_driver_probe(__driver, ...); 27 + | 19 28 module_i2c_driver(__driver); 20 29 | 21 30 module_platform_driver(__driver);
+7 -7
scripts/coccinelle/misc/of_table.cocci
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 - /// Make sure (of/i2c/platform)_device_id tables are NULL terminated 2 + /// Make sure (of/i2c/platform/spi)_device_id tables are NULL terminated 3 3 // 4 4 // Keywords: of_table i2c_table platform_table 5 5 // Confidence: Medium ··· 15 15 expression E; 16 16 @@ 17 17 ( 18 - struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { 18 + struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { 19 19 ..., 20 20 { 21 21 .var = E, 22 22 * } 23 23 }; 24 24 | 25 - struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { 25 + struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { 26 26 ..., 27 27 * { ..., E, ... }, 28 28 }; ··· 33 33 expression E; 34 34 @@ 35 35 ( 36 - struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { 36 + struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { 37 37 ..., 38 38 { 39 39 .var = E, ··· 42 42 + { } 43 43 }; 44 44 | 45 - struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { 45 + struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { 46 46 ..., 47 47 { ..., E, ... }, 48 48 + { }, ··· 55 55 expression E; 56 56 @@ 57 57 ( 58 - struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { 58 + struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { 59 59 ..., 60 60 { 61 61 .var = E, ··· 63 63 @p1 64 64 }; 65 65 | 66 - struct \(of_device_id \| i2c_device_id \| platform_device_id\) arr[] = { 66 + struct \(of_device_id \| i2c_device_id \| platform_device_id \| spi_device_id\) arr[] = { 67 67 ..., 68 68 { ..., E, ... } 69 69 @p1
+426 -143
tools/accounting/delaytop.c
··· 42 42 #include <linux/genetlink.h> 43 43 #include <linux/taskstats.h> 44 44 #include <linux/cgroupstats.h> 45 + #include <stddef.h> 45 46 46 - #define PSI_CPU_SOME "/proc/pressure/cpu" 47 - #define PSI_CPU_FULL "/proc/pressure/cpu" 48 - #define PSI_MEMORY_SOME "/proc/pressure/memory" 49 - #define PSI_MEMORY_FULL "/proc/pressure/memory" 50 - #define PSI_IO_SOME "/proc/pressure/io" 51 - #define PSI_IO_FULL "/proc/pressure/io" 52 - #define PSI_IRQ_FULL "/proc/pressure/irq" 47 + #define PSI_PATH "/proc/pressure" 48 + #define PSI_CPU_PATH "/proc/pressure/cpu" 49 + #define PSI_MEMORY_PATH "/proc/pressure/memory" 50 + #define PSI_IO_PATH "/proc/pressure/io" 51 + #define PSI_IRQ_PATH "/proc/pressure/irq" 53 52 54 53 #define NLA_NEXT(na) ((struct nlattr *)((char *)(na) + NLA_ALIGN((na)->nla_len))) 55 54 #define NLA_DATA(na) ((void *)((char *)(na) + NLA_HDRLEN)) ··· 60 61 #define TASK_COMM_LEN 16 61 62 #define MAX_MSG_SIZE 1024 62 63 #define MAX_TASKS 1000 64 + #define MAX_BUF_LEN 256 63 65 #define SET_TASK_STAT(task_count, field) tasks[task_count].field = stats.field 64 66 #define BOOL_FPRINT(stream, fmt, ...) 
\ 65 67 ({ \ 66 68 int ret = fprintf(stream, fmt, ##__VA_ARGS__); \ 67 69 ret >= 0; \ 68 70 }) 71 + #define TASK_AVG(task, field) average_ms((task).field##_delay_total, (task).field##_count) 69 72 #define PSI_LINE_FORMAT "%-12s %6.1f%%/%6.1f%%/%6.1f%%/%8llu(ms)\n" 73 + #define DELAY_FMT_DEFAULT "%8.2f %8.2f %8.2f %8.2f\n" 74 + #define DELAY_FMT_MEMVERBOSE "%8.2f %8.2f %8.2f %8.2f %8.2f %8.2f\n" 75 + #define SORT_FIELD(name, cmd, modes) \ 76 + {#name, #cmd, \ 77 + offsetof(struct task_info, name##_delay_total), \ 78 + offsetof(struct task_info, name##_count), \ 79 + modes} 80 + #define END_FIELD {NULL, 0, 0} 70 81 71 - /* Program settings structure */ 72 - struct config { 73 - int delay; /* Update interval in seconds */ 74 - int iterations; /* Number of iterations, 0 == infinite */ 75 - int max_processes; /* Maximum number of processes to show */ 76 - char sort_field; /* Field to sort by */ 77 - int output_one_time; /* Output once and exit */ 78 - int monitor_pid; /* Monitor specific PID */ 79 - char *container_path; /* Path to container cgroup */ 80 - }; 82 + /* Display mode types */ 83 + #define MODE_TYPE_ALL (0xFFFFFFFF) 84 + #define MODE_DEFAULT (1 << 0) 85 + #define MODE_MEMVERBOSE (1 << 1) 81 86 82 87 /* PSI statistics structure */ 83 88 struct psi_stats { ··· 122 119 unsigned long long wpcopy_delay_total; 123 120 unsigned long long irq_count; 124 121 unsigned long long irq_delay_total; 122 + unsigned long long mem_count; 123 + unsigned long long mem_delay_total; 125 124 }; 126 125 127 126 /* Container statistics structure */ ··· 135 130 int nr_io_wait; /* Number of processes in IO wait */ 136 131 }; 137 132 133 + /* Delay field structure */ 134 + struct field_desc { 135 + const char *name; /* Field name for cmdline argument */ 136 + const char *cmd_char; /* Interactive command */ 137 + unsigned long total_offset; /* Offset of total delay in task_info */ 138 + unsigned long count_offset; /* Offset of count in task_info */ 139 + size_t supported_modes; /* 
Supported display modes */ 140 + }; 141 + 142 + /* Program settings structure */ 143 + struct config { 144 + int delay; /* Update interval in seconds */ 145 + int iterations; /* Number of iterations, 0 == infinite */ 146 + int max_processes; /* Maximum number of processes to show */ 147 + int output_one_time; /* Output once and exit */ 148 + int monitor_pid; /* Monitor specific PID */ 149 + char *container_path; /* Path to container cgroup */ 150 + const struct field_desc *sort_field; /* Current sort field */ 151 + size_t display_mode; /* Current display mode */ 152 + }; 153 + 138 154 /* Global variables */ 139 155 static struct config cfg; 140 156 static struct psi_stats psi; ··· 163 137 static int task_count; 164 138 static int running = 1; 165 139 static struct container_stats container_stats; 140 + static const struct field_desc sort_fields[] = { 141 + SORT_FIELD(cpu, c, MODE_DEFAULT), 142 + SORT_FIELD(blkio, i, MODE_DEFAULT), 143 + SORT_FIELD(irq, q, MODE_DEFAULT), 144 + SORT_FIELD(mem, m, MODE_DEFAULT | MODE_MEMVERBOSE), 145 + SORT_FIELD(swapin, s, MODE_MEMVERBOSE), 146 + SORT_FIELD(freepages, r, MODE_MEMVERBOSE), 147 + SORT_FIELD(thrashing, t, MODE_MEMVERBOSE), 148 + SORT_FIELD(compact, p, MODE_MEMVERBOSE), 149 + SORT_FIELD(wpcopy, w, MODE_MEMVERBOSE), 150 + END_FIELD 151 + }; 152 + static int sort_selected; 166 153 167 154 /* Netlink socket variables */ 168 155 static int nl_sd = -1; ··· 197 158 tcsetattr(STDIN_FILENO, TCSAFLUSH, &orig_termios); 198 159 } 199 160 161 + /* Find field descriptor by command line */ 162 + static const struct field_desc *get_field_by_cmd_char(char ch) 163 + { 164 + const struct field_desc *field; 165 + 166 + for (field = sort_fields; field->name != NULL; field++) { 167 + if (field->cmd_char[0] == ch) 168 + return field; 169 + } 170 + 171 + return NULL; 172 + } 173 + 174 + /* Find field descriptor by name with string comparison */ 175 + static const struct field_desc *get_field_by_name(const char *name) 176 + { 177 + const struct 
field_desc *field; 178 + size_t field_len; 179 + 180 + for (field = sort_fields; field->name != NULL; field++) { 181 + field_len = strlen(field->name); 182 + if (field_len != strlen(name)) 183 + continue; 184 + if (strncmp(field->name, name, field_len) == 0) 185 + return field; 186 + } 187 + 188 + return NULL; 189 + } 190 + 191 + /* Find display name for a field descriptor */ 192 + static const char *get_name_by_field(const struct field_desc *field) 193 + { 194 + return field ? field->name : "UNKNOWN"; 195 + } 196 + 197 + /* Generate string of available field names */ 198 + static void display_available_fields(size_t mode) 199 + { 200 + const struct field_desc *field; 201 + char buf[MAX_BUF_LEN]; 202 + 203 + buf[0] = '\0'; 204 + 205 + for (field = sort_fields; field->name != NULL; field++) { 206 + if (!(field->supported_modes & mode)) 207 + continue; 208 + strncat(buf, "|", MAX_BUF_LEN - strlen(buf) - 1); 209 + strncat(buf, field->name, MAX_BUF_LEN - strlen(buf) - 1); 210 + buf[MAX_BUF_LEN - 1] = '\0'; 211 + } 212 + 213 + fprintf(stderr, "Available fields: %s\n", buf); 214 + } 215 + 200 216 /* Display usage information and command line options */ 201 217 static void usage(void) 202 218 { 203 219 printf("Usage: delaytop [Options]\n" 204 220 "Options:\n" 205 - " -h, --help Show this help message and exit\n" 206 - " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" 207 - " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" 208 - " -P, --processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" 209 - " -o, --once Display once and exit\n" 210 - " -p, --pid=PID Monitor only the specified PID\n" 211 - " -C, --container=PATH Monitor the container at specified cgroup path\n"); 221 + " -h, --help Show this help message and exit\n" 222 + " -d, --delay=SECONDS Set refresh interval (default: 2 seconds, min: 1)\n" 223 + " -n, --iterations=COUNT Set number of updates (default: 0 = infinite)\n" 224 + " -P, 
--processes=NUMBER Set maximum number of processes to show (default: 20, max: 1000)\n" 225 + " -o, --once Display once and exit\n" 226 + " -p, --pid=PID Monitor only the specified PID\n" 227 + " -C, --container=PATH Monitor the container at specified cgroup path\n" 228 + " -s, --sort=FIELD Sort by delay field (default: cpu)\n" 229 + " -M, --memverbose Display memory detailed information\n"); 212 230 exit(0); 213 231 } 214 232 ··· 273 177 static void parse_args(int argc, char **argv) 274 178 { 275 179 int c; 180 + const struct field_desc *field; 276 181 struct option long_options[] = { 277 182 {"help", no_argument, 0, 'h'}, 278 183 {"delay", required_argument, 0, 'd'}, ··· 281 184 {"pid", required_argument, 0, 'p'}, 282 185 {"once", no_argument, 0, 'o'}, 283 186 {"processes", required_argument, 0, 'P'}, 187 + {"sort", required_argument, 0, 's'}, 284 188 {"container", required_argument, 0, 'C'}, 189 + {"memverbose", no_argument, 0, 'M'}, 285 190 {0, 0, 0, 0} 286 191 }; 287 192 ··· 291 192 cfg.delay = 2; 292 193 cfg.iterations = 0; 293 194 cfg.max_processes = 20; 294 - cfg.sort_field = 'c'; /* Default sort by CPU delay */ 195 + cfg.sort_field = &sort_fields[0]; /* Default sorted by CPU delay */ 295 196 cfg.output_one_time = 0; 296 197 cfg.monitor_pid = 0; /* 0 means monitor all PIDs */ 297 198 cfg.container_path = NULL; 199 + cfg.display_mode = MODE_DEFAULT; 298 200 299 201 while (1) { 300 202 int option_index = 0; 301 203 302 - c = getopt_long(argc, argv, "hd:n:p:oP:C:", long_options, &option_index); 204 + c = getopt_long(argc, argv, "hd:n:p:oP:C:s:M", long_options, &option_index); 303 205 if (c == -1) 304 206 break; 305 207 ··· 347 247 case 'C': 348 248 cfg.container_path = strdup(optarg); 349 249 break; 250 + case 's': 251 + if (strlen(optarg) == 0) { 252 + fprintf(stderr, "Error: empty sort field\n"); 253 + exit(1); 254 + } 255 + 256 + field = get_field_by_name(optarg); 257 + /* Show available fields if invalid option provided */ 258 + if (!field) { 259 + 
fprintf(stderr, "Error: invalid sort field '%s'\n", optarg); 260 + display_available_fields(MODE_TYPE_ALL); 261 + exit(1); 262 + } 263 + 264 + cfg.sort_field = field; 265 + break; 266 + case 'M': 267 + cfg.display_mode = MODE_MEMVERBOSE; 268 + cfg.sort_field = get_field_by_name("mem"); 269 + break; 350 270 default: 351 271 fprintf(stderr, "Try 'delaytop --help' for more information.\n"); 352 272 exit(1); 353 273 } 354 274 } 275 + } 276 + 277 + /* Calculate average delay in milliseconds for overall memory */ 278 + static void set_mem_delay_total(struct task_info *t) 279 + { 280 + t->mem_delay_total = t->swapin_delay_total + 281 + t->freepages_delay_total + 282 + t->thrashing_delay_total + 283 + t->compact_delay_total + 284 + t->wpcopy_delay_total; 285 + } 286 + 287 + static void set_mem_count(struct task_info *t) 288 + { 289 + t->mem_count = t->swapin_count + 290 + t->freepages_count + 291 + t->thrashing_count + 292 + t->compact_count + 293 + t->wpcopy_count; 355 294 } 356 295 357 296 /* Create a raw netlink socket and bind */ ··· 497 358 return id; 498 359 } 499 360 500 - static void read_psi_stats(void) 361 + static int read_psi_stats(void) 501 362 { 502 363 FILE *fp; 503 364 char line[256]; 504 365 int ret = 0; 366 + int error_count = 0; 367 + 368 + /* Check if PSI path exists */ 369 + if (access(PSI_PATH, F_OK) != 0) { 370 + fprintf(stderr, "Error: PSI interface not found at %s\n", PSI_PATH); 371 + fprintf(stderr, "Please ensure your kernel supports PSI (Pressure Stall Information)\n"); 372 + return -1; 373 + } 374 + 505 375 /* Zero all fields */ 506 376 memset(&psi, 0, sizeof(psi)); 377 + 507 378 /* CPU pressure */ 508 - fp = fopen(PSI_CPU_SOME, "r"); 379 + fp = fopen(PSI_CPU_PATH, "r"); 509 380 if (fp) { 510 381 while (fgets(line, sizeof(line), fp)) { 511 382 if (strncmp(line, "some", 4) == 0) { 512 383 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", 513 384 &psi.cpu_some_avg10, &psi.cpu_some_avg60, 514 385 &psi.cpu_some_avg300, 
&psi.cpu_some_total); 515 - if (ret != 4) 386 + if (ret != 4) { 516 387 fprintf(stderr, "Failed to parse CPU some PSI data\n"); 388 + error_count++; 389 + } 517 390 } else if (strncmp(line, "full", 4) == 0) { 518 391 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 519 392 &psi.cpu_full_avg10, &psi.cpu_full_avg60, 520 393 &psi.cpu_full_avg300, &psi.cpu_full_total); 521 - if (ret != 4) 394 + if (ret != 4) { 522 395 fprintf(stderr, "Failed to parse CPU full PSI data\n"); 396 + error_count++; 397 + } 523 398 } 524 399 } 525 400 fclose(fp); 401 + } else { 402 + fprintf(stderr, "Warning: Failed to open %s\n", PSI_CPU_PATH); 403 + error_count++; 526 404 } 405 + 527 406 /* Memory pressure */ 528 - fp = fopen(PSI_MEMORY_SOME, "r"); 407 + fp = fopen(PSI_MEMORY_PATH, "r"); 529 408 if (fp) { 530 409 while (fgets(line, sizeof(line), fp)) { 531 410 if (strncmp(line, "some", 4) == 0) { 532 411 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", 533 412 &psi.memory_some_avg10, &psi.memory_some_avg60, 534 413 &psi.memory_some_avg300, &psi.memory_some_total); 535 - if (ret != 4) 414 + if (ret != 4) { 536 415 fprintf(stderr, "Failed to parse Memory some PSI data\n"); 416 + error_count++; 417 + } 537 418 } else if (strncmp(line, "full", 4) == 0) { 538 419 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 539 420 &psi.memory_full_avg10, &psi.memory_full_avg60, 540 421 &psi.memory_full_avg300, &psi.memory_full_total); 541 - } 542 - if (ret != 4) 422 + if (ret != 4) { 543 423 fprintf(stderr, "Failed to parse Memory full PSI data\n"); 424 + error_count++; 425 + } 426 + } 544 427 } 545 428 fclose(fp); 429 + } else { 430 + fprintf(stderr, "Warning: Failed to open %s\n", PSI_MEMORY_PATH); 431 + error_count++; 546 432 } 433 + 547 434 /* IO pressure */ 548 - fp = fopen(PSI_IO_SOME, "r"); 435 + fp = fopen(PSI_IO_PATH, "r"); 549 436 if (fp) { 550 437 while (fgets(line, sizeof(line), fp)) { 551 438 if (strncmp(line, "some", 4) == 0) { 552 
439 ret = sscanf(line, "some avg10=%lf avg60=%lf avg300=%lf total=%llu", 553 440 &psi.io_some_avg10, &psi.io_some_avg60, 554 441 &psi.io_some_avg300, &psi.io_some_total); 555 - if (ret != 4) 442 + if (ret != 4) { 556 443 fprintf(stderr, "Failed to parse IO some PSI data\n"); 444 + error_count++; 445 + } 557 446 } else if (strncmp(line, "full", 4) == 0) { 558 447 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 559 448 &psi.io_full_avg10, &psi.io_full_avg60, 560 449 &psi.io_full_avg300, &psi.io_full_total); 561 - if (ret != 4) 450 + if (ret != 4) { 562 451 fprintf(stderr, "Failed to parse IO full PSI data\n"); 452 + error_count++; 453 + } 563 454 } 564 455 } 565 456 fclose(fp); 457 + } else { 458 + fprintf(stderr, "Warning: Failed to open %s\n", PSI_IO_PATH); 459 + error_count++; 566 460 } 461 + 567 462 /* IRQ pressure (only full) */ 568 - fp = fopen(PSI_IRQ_FULL, "r"); 463 + fp = fopen(PSI_IRQ_PATH, "r"); 569 464 if (fp) { 570 465 while (fgets(line, sizeof(line), fp)) { 571 466 if (strncmp(line, "full", 4) == 0) { 572 467 ret = sscanf(line, "full avg10=%lf avg60=%lf avg300=%lf total=%llu", 573 468 &psi.irq_full_avg10, &psi.irq_full_avg60, 574 469 &psi.irq_full_avg300, &psi.irq_full_total); 575 - if (ret != 4) 470 + if (ret != 4) { 576 471 fprintf(stderr, "Failed to parse IRQ full PSI data\n"); 472 + error_count++; 473 + } 577 474 } 578 475 } 579 476 fclose(fp); 477 + } else { 478 + fprintf(stderr, "Warning: Failed to open %s\n", PSI_IRQ_PATH); 479 + error_count++; 580 480 } 481 + 482 + /* Return error count: 0 means success, >0 means warnings, -1 means fatal error */ 483 + if (error_count > 0) { 484 + fprintf(stderr, "PSI stats reading completed with %d warnings\n", error_count); 485 + return error_count; 486 + } 487 + 488 + return 0; 581 489 } 582 490 583 491 static int read_comm(int pid, char *comm_buf, size_t buf_size) ··· 713 527 SET_TASK_STAT(task_count, wpcopy_delay_total); 714 528 SET_TASK_STAT(task_count, irq_count); 715 529 
SET_TASK_STAT(task_count, irq_delay_total); 530 + set_mem_count(&tasks[task_count]); 531 + set_mem_delay_total(&tasks[task_count]); 716 532 task_count++; 717 533 } 718 534 break; ··· 775 587 { 776 588 const struct task_info *t1 = (const struct task_info *)a; 777 589 const struct task_info *t2 = (const struct task_info *)b; 590 + unsigned long long total1; 591 + unsigned long long total2; 592 + unsigned long count1; 593 + unsigned long count2; 778 594 double avg1, avg2; 779 595 780 - switch (cfg.sort_field) { 781 - case 'c': /* CPU */ 782 - avg1 = average_ms(t1->cpu_delay_total, t1->cpu_count); 783 - avg2 = average_ms(t2->cpu_delay_total, t2->cpu_count); 784 - if (avg1 != avg2) 785 - return avg2 > avg1 ? 1 : -1; 786 - return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; 596 + total1 = *(unsigned long long *)((char *)t1 + cfg.sort_field->total_offset); 597 + total2 = *(unsigned long long *)((char *)t2 + cfg.sort_field->total_offset); 598 + count1 = *(unsigned long *)((char *)t1 + cfg.sort_field->count_offset); 599 + count2 = *(unsigned long *)((char *)t2 + cfg.sort_field->count_offset); 787 600 788 - default: 789 - return t2->cpu_delay_total > t1->cpu_delay_total ? 1 : -1; 790 - } 601 + avg1 = average_ms(total1, count1); 602 + avg2 = average_ms(total2, count2); 603 + if (avg1 != avg2) 604 + return avg2 > avg1 ? 
1 : -1; 605 + 606 + return 0; 791 607 } 792 608 793 609 /* Sort tasks by selected field */ ··· 865 673 } 866 674 867 675 /* Display results to stdout or log file */ 868 - static void display_results(void) 676 + static void display_results(int psi_ret) 869 677 { 870 678 time_t now = time(NULL); 871 679 struct tm *tm_now = localtime(&now); ··· 878 686 suc &= BOOL_FPRINT(out, "\033[H\033[J"); 879 687 880 688 /* PSI output (one-line, no cat style) */ 881 - suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n"); 882 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 883 - "CPU some:", 884 - psi.cpu_some_avg10, 885 - psi.cpu_some_avg60, 886 - psi.cpu_some_avg300, 887 - psi.cpu_some_total / 1000); 888 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 889 - "CPU full:", 890 - psi.cpu_full_avg10, 891 - psi.cpu_full_avg60, 892 - psi.cpu_full_avg300, 893 - psi.cpu_full_total / 1000); 894 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 895 - "Memory full:", 896 - psi.memory_full_avg10, 897 - psi.memory_full_avg60, 898 - psi.memory_full_avg300, 899 - psi.memory_full_total / 1000); 900 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 901 - "Memory some:", 902 - psi.memory_some_avg10, 903 - psi.memory_some_avg60, 904 - psi.memory_some_avg300, 905 - psi.memory_some_total / 1000); 906 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 907 - "IO full:", 908 - psi.io_full_avg10, 909 - psi.io_full_avg60, 910 - psi.io_full_avg300, 911 - psi.io_full_total / 1000); 912 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 913 - "IO some:", 914 - psi.io_some_avg10, 915 - psi.io_some_avg60, 916 - psi.io_some_avg300, 917 - psi.io_some_total / 1000); 918 - suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 919 - "IRQ full:", 920 - psi.irq_full_avg10, 921 - psi.irq_full_avg60, 922 - psi.irq_full_avg300, 923 - psi.irq_full_total / 1000); 689 + suc &= BOOL_FPRINT(out, "System Pressure Information: (avg10/avg60/avg300/total)\n"); 690 + if (psi_ret) { 691 + suc &= BOOL_FPRINT(out, " PSI not found: check if psi=1 enabled in
cmdline\n"); 692 + } else { 693 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 694 + "CPU some:", 695 + psi.cpu_some_avg10, 696 + psi.cpu_some_avg60, 697 + psi.cpu_some_avg300, 698 + psi.cpu_some_total / 1000); 699 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 700 + "CPU full:", 701 + psi.cpu_full_avg10, 702 + psi.cpu_full_avg60, 703 + psi.cpu_full_avg300, 704 + psi.cpu_full_total / 1000); 705 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 706 + "Memory full:", 707 + psi.memory_full_avg10, 708 + psi.memory_full_avg60, 709 + psi.memory_full_avg300, 710 + psi.memory_full_total / 1000); 711 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 712 + "Memory some:", 713 + psi.memory_some_avg10, 714 + psi.memory_some_avg60, 715 + psi.memory_some_avg300, 716 + psi.memory_some_total / 1000); 717 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 718 + "IO full:", 719 + psi.io_full_avg10, 720 + psi.io_full_avg60, 721 + psi.io_full_avg300, 722 + psi.io_full_total / 1000); 723 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 724 + "IO some:", 725 + psi.io_some_avg10, 726 + psi.io_some_avg60, 727 + psi.io_some_avg300, 728 + psi.io_some_total / 1000); 729 + suc &= BOOL_FPRINT(out, PSI_LINE_FORMAT, 730 + "IRQ full:", 731 + psi.irq_full_avg10, 732 + psi.irq_full_avg60, 733 + psi.irq_full_avg300, 734 + psi.irq_full_total / 1000); 735 + } 924 736 925 737 if (cfg.container_path) { 926 738 suc &= BOOL_FPRINT(out, "Container Information (%s):\n", cfg.container_path); ··· 934 738 container_stats.nr_stopped, container_stats.nr_uninterruptible, 935 739 container_stats.nr_io_wait); 936 740 } 937 - suc &= BOOL_FPRINT(out, "Top %d processes (sorted by CPU delay):\n", 938 - cfg.max_processes); 939 - suc &= BOOL_FPRINT(out, "%5s %5s %-17s", "PID", "TGID", "COMMAND"); 940 - suc &= BOOL_FPRINT(out, "%7s %7s %7s %7s %7s %7s %7s %7s\n", 941 - "CPU(ms)", "IO(ms)", "SWAP(ms)", "RCL(ms)", 942 - "THR(ms)", "CMP(ms)", "WP(ms)", "IRQ(ms)"); 943 741 944 - suc &= BOOL_FPRINT(out, "-----------------------------------------------"); 945 - 
suc &= BOOL_FPRINT(out, "----------------------------------------------\n"); 742 + /* Interactive command */ 743 + suc &= BOOL_FPRINT(out, "[o]sort [M]memverbose [q]quit\n"); 744 + if (sort_selected) { 745 + if (cfg.display_mode == MODE_MEMVERBOSE) 746 + suc &= BOOL_FPRINT(out, 747 + "sort selection: [m]MEM [r]RCL [t]THR [p]CMP [w]WP\n"); 748 + else 749 + suc &= BOOL_FPRINT(out, 750 + "sort selection: [c]CPU [i]IO [m]MEM [q]IRQ\n"); 751 + } 752 + 753 + /* Task delay output */ 754 + suc &= BOOL_FPRINT(out, "Top %d processes (sorted by %s delay):\n", 755 + cfg.max_processes, get_name_by_field(cfg.sort_field)); 756 + 757 + suc &= BOOL_FPRINT(out, "%8s %8s %-17s", "PID", "TGID", "COMMAND"); 758 + if (cfg.display_mode == MODE_MEMVERBOSE) { 759 + suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s %8s %8s\n", 760 + "MEM(ms)", "SWAP(ms)", "RCL(ms)", 761 + "THR(ms)", "CMP(ms)", "WP(ms)"); 762 + suc &= BOOL_FPRINT(out, "-----------------------"); 763 + suc &= BOOL_FPRINT(out, "-----------------------"); 764 + suc &= BOOL_FPRINT(out, "-----------------------"); 765 + suc &= BOOL_FPRINT(out, "---------------------\n"); 766 + } else { 767 + suc &= BOOL_FPRINT(out, "%8s %8s %8s %8s\n", 768 + "CPU(ms)", "IO(ms)", "IRQ(ms)", "MEM(ms)"); 769 + suc &= BOOL_FPRINT(out, "-----------------------"); 770 + suc &= BOOL_FPRINT(out, "-----------------------"); 771 + suc &= BOOL_FPRINT(out, "--------------------------\n"); 772 + } 773 + 946 774 count = task_count < cfg.max_processes ?
task_count : cfg.max_processes; 947 775 948 776 for (i = 0; i < count; i++) { 949 - suc &= BOOL_FPRINT(out, "%5d %5d %-15s", 777 + suc &= BOOL_FPRINT(out, "%8d %8d %-15s", 950 778 tasks[i].pid, tasks[i].tgid, tasks[i].command); 951 - suc &= BOOL_FPRINT(out, "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f\n", 952 - average_ms(tasks[i].cpu_delay_total, tasks[i].cpu_count), 953 - average_ms(tasks[i].blkio_delay_total, tasks[i].blkio_count), 954 - average_ms(tasks[i].swapin_delay_total, tasks[i].swapin_count), 955 - average_ms(tasks[i].freepages_delay_total, tasks[i].freepages_count), 956 - average_ms(tasks[i].thrashing_delay_total, tasks[i].thrashing_count), 957 - average_ms(tasks[i].compact_delay_total, tasks[i].compact_count), 958 - average_ms(tasks[i].wpcopy_delay_total, tasks[i].wpcopy_count), 959 - average_ms(tasks[i].irq_delay_total, tasks[i].irq_count)); 779 + if (cfg.display_mode == MODE_MEMVERBOSE) { 780 + suc &= BOOL_FPRINT(out, DELAY_FMT_MEMVERBOSE, 781 + TASK_AVG(tasks[i], mem), 782 + TASK_AVG(tasks[i], swapin), 783 + TASK_AVG(tasks[i], freepages), 784 + TASK_AVG(tasks[i], thrashing), 785 + TASK_AVG(tasks[i], compact), 786 + TASK_AVG(tasks[i], wpcopy)); 787 + } else { 788 + suc &= BOOL_FPRINT(out, DELAY_FMT_DEFAULT, 789 + TASK_AVG(tasks[i], cpu), 790 + TASK_AVG(tasks[i], blkio), 791 + TASK_AVG(tasks[i], irq), 792 + TASK_AVG(tasks[i], mem)); 793 + } 960 794 } 961 795 962 796 suc &= BOOL_FPRINT(out, "\n"); ··· 995 769 perror("Error writing to output"); 996 770 } 997 771 772 + /* Check for keyboard input with timeout based on cfg.delay */ 773 + static char check_for_keypress(void) 774 + { 775 + struct timeval tv = {cfg.delay, 0}; 776 + fd_set readfds; 777 + char ch = 0; 778 + 779 + FD_ZERO(&readfds); 780 + FD_SET(STDIN_FILENO, &readfds); 781 + int r = select(STDIN_FILENO + 1, &readfds, NULL, NULL, &tv); 782 + 783 + if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { 784 + read(STDIN_FILENO, &ch, 1); 785 + return ch; 786 + } 787 + 788 + return 0; 789 + } 790 + 
791 + #define MAX_MODE_SIZE 2 792 + static void toggle_display_mode(void) 793 + { 794 + static const size_t modes[MAX_MODE_SIZE] = {MODE_DEFAULT, MODE_MEMVERBOSE}; 795 + static size_t cur_index; 796 + 797 + cur_index = (cur_index + 1) % MAX_MODE_SIZE; 798 + cfg.display_mode = modes[cur_index]; 799 + } 800 + 801 + /* Handle keyboard input: sorting selection, mode toggle, or quit */ 802 + static void handle_keypress(char ch, int *running) 803 + { 804 + const struct field_desc *field; 805 + 806 + /* Change sort field */ 807 + if (sort_selected) { 808 + field = get_field_by_cmd_char(ch); 809 + if (field && (field->supported_modes & cfg.display_mode)) 810 + cfg.sort_field = field; 811 + 812 + sort_selected = 0; 813 + /* Handle mode changes or quit */ 814 + } else { 815 + switch (ch) { 816 + case 'o': 817 + sort_selected = 1; 818 + break; 819 + case 'M': 820 + toggle_display_mode(); 821 + for (field = sort_fields; field->name != NULL; field++) { 822 + if (field->supported_modes & cfg.display_mode) { 823 + cfg.sort_field = field; 824 + break; 825 + } 826 + } 827 + break; 828 + case 'q': 829 + case 'Q': 830 + *running = 0; 831 + break; 832 + default: 833 + break; 834 + } 835 + } 836 + } 837 + 998 838 /* Main function */ 999 839 int main(int argc, char **argv) 1000 840 { 841 + const struct field_desc *field; 1001 842 int iterations = 0; 1002 - int use_q_quit = 0; 843 + int psi_ret = 0; 844 + char keypress; 1003 845 1004 846 /* Parse command line arguments */ 1005 847 parse_args(argc, argv); ··· 1087 793 exit(1); 1088 794 } 1089 795 1090 - if (!cfg.output_one_time) { 1091 - use_q_quit = 1; 1092 - enable_raw_mode(); 1093 - printf("Press 'q' to quit.\n"); 1094 - fflush(stdout); 1095 - } 796 + /* Set terminal to non-canonical mode for interaction */ 797 + enable_raw_mode(); 1096 798 1097 799 /* Main loop */ 1098 800 while (running) { 801 + /* Auto-switch sort field when not matching display mode */ 802 + if (!(cfg.sort_field->supported_modes & cfg.display_mode)) { 803 + for 
(field = sort_fields; field->name != NULL; field++) { 804 + if (field->supported_modes & cfg.display_mode) { 805 + cfg.sort_field = field; 806 + printf("Auto-switched sort field to: %s\n", field->name); 807 + break; 808 + } 809 + } 810 + } 811 + 1099 812 /* Read PSI statistics */ 1100 - read_psi_stats(); 813 + psi_ret = read_psi_stats(); 1101 814 1102 815 /* Get container stats if container path provided */ 1103 816 if (cfg.container_path) ··· 1117 816 sort_tasks(); 1118 817 1119 818 /* Display results to stdout or log file */ 1120 - display_results(); 819 + display_results(psi_ret); 1121 820 1122 821 /* Check for iterations */ 1123 822 if (cfg.iterations > 0 && ++iterations >= cfg.iterations) ··· 1127 826 if (cfg.output_one_time) 1128 827 break; 1129 828 1130 - /* Check for 'q' key to quit */ 1131 - if (use_q_quit) { 1132 - struct timeval tv = {cfg.delay, 0}; 1133 - fd_set readfds; 1134 - 1135 - FD_ZERO(&readfds); 1136 - FD_SET(STDIN_FILENO, &readfds); 1137 - int r = select(STDIN_FILENO+1, &readfds, NULL, NULL, &tv); 1138 - 1139 - if (r > 0 && FD_ISSET(STDIN_FILENO, &readfds)) { 1140 - char ch = 0; 1141 - 1142 - read(STDIN_FILENO, &ch, 1); 1143 - if (ch == 'q' || ch == 'Q') { 1144 - running = 0; 1145 - break; 1146 - } 1147 - } 1148 - } else { 1149 - sleep(cfg.delay); 1150 - } 829 + /* Keypress for interactive usage */ 830 + keypress = check_for_keypress(); 831 + if (keypress) 832 + handle_keypress(keypress, &running); 1151 833 } 1152 834 1153 835 /* Restore terminal mode */ 1154 - if (use_q_quit) 1155 - disable_raw_mode(); 836 + disable_raw_mode(); 1156 837 1157 838 /* Cleanup */ 1158 839 close(nl_sd);
+7 -9
tools/testing/radix-tree/idr-test.c
··· 499 499 goto repeat; 500 500 } 501 501 502 - void ida_simple_get_remove_test(void) 502 + void ida_alloc_free_test(void) 503 503 { 504 504 DEFINE_IDA(ida); 505 505 unsigned long i; 506 506 507 - for (i = 0; i < 10000; i++) { 508 - assert(ida_simple_get(&ida, 0, 20000, GFP_KERNEL) == i); 509 - } 510 - assert(ida_simple_get(&ida, 5, 30, GFP_KERNEL) < 0); 507 + for (i = 0; i < 10000; i++) 508 + assert(ida_alloc_max(&ida, 20000, GFP_KERNEL) == i); 509 + assert(ida_alloc_range(&ida, 5, 30, GFP_KERNEL) < 0); 511 510 512 - for (i = 0; i < 10000; i++) { 513 - ida_simple_remove(&ida, i); 514 - } 511 + for (i = 0; i < 10000; i++) 512 + ida_free(&ida, i); 515 513 assert(ida_is_empty(&ida)); 516 514 517 515 ida_destroy(&ida); ··· 522 524 ida_check_nomem(); 523 525 ida_check_conv_user(); 524 526 ida_check_random(); 525 - ida_simple_get_remove_test(); 527 + ida_alloc_free_test(); 526 528 527 529 radix_tree_cpu_dead(1); 528 530 }
+1
tools/testing/selftests/proc/.gitignore
··· 7 7 /proc-loadavg-001 8 8 /proc-maps-race 9 9 /proc-multiple-procfs 10 + /proc-net-dev-lseek 10 11 /proc-empty-vm 11 12 /proc-pid-vm 12 13 /proc-self-map-files-001
+1
tools/testing/selftests/proc/Makefile
··· 10 10 TEST_GEN_PROGS += proc-2-is-kthread 11 11 TEST_GEN_PROGS += proc-loadavg-001 12 12 TEST_GEN_PROGS += proc-maps-race 13 + TEST_GEN_PROGS += proc-net-dev-lseek 13 14 TEST_GEN_PROGS += proc-empty-vm 14 15 TEST_GEN_PROGS += proc-pid-vm 15 16 TEST_GEN_PROGS += proc-self-map-files-001
+68
tools/testing/selftests/proc/proc-net-dev-lseek.c
··· 1 + /* 2 + * Copyright (c) 2025 Alexey Dobriyan <adobriyan@gmail.com> 3 + * 4 + * Permission to use, copy, modify, and distribute this software for any 5 + * purpose with or without fee is hereby granted, provided that the above 6 + * copyright notice and this permission notice appear in all copies. 7 + * 8 + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 + */ 16 + #undef _GNU_SOURCE 17 + #define _GNU_SOURCE 18 + #undef NDEBUG 19 + #include <assert.h> 20 + #include <errno.h> 21 + #include <fcntl.h> 22 + #include <string.h> 23 + #include <unistd.h> 24 + #include <sched.h> 25 + /* 26 + * Test that lseek("/proc/net/dev/", 0, SEEK_SET) 27 + * a) works, 28 + * b) does what you think it does. 29 + */ 30 + int main(void) 31 + { 32 + /* /proc/net/dev output is deterministic in fresh netns only. */ 33 + if (unshare(CLONE_NEWNET) == -1) { 34 + if (errno == ENOSYS || errno == EPERM) { 35 + return 4; 36 + } 37 + return 1; 38 + } 39 + 40 + const int fd = open("/proc/net/dev", O_RDONLY); 41 + assert(fd >= 0); 42 + 43 + char buf1[4096]; 44 + const ssize_t rv1 = read(fd, buf1, sizeof(buf1)); 45 + /* 46 + * Not "<=", this file can't be empty: 47 + * there is header, "lo" interface with some zeroes. 48 + */ 49 + assert(0 < rv1); 50 + assert(rv1 <= sizeof(buf1)); 51 + 52 + /* Believe it or not, this line broke one day. */ 53 + assert(lseek(fd, 0, SEEK_SET) == 0); 54 + 55 + char buf2[4096]; 56 + const ssize_t rv2 = read(fd, buf2, sizeof(buf2)); 57 + /* Not "<=", see above. 
*/ 58 + assert(0 < rv2); 59 + assert(rv2 <= sizeof(buf2)); 60 + 61 + /* Test that lseek rewinds to the beginning of the file. */ 62 + assert(rv1 == rv2); 63 + assert(memcmp(buf1, buf2, rv1) == 0); 64 + 65 + /* Contents of the file is not validated: this test is about lseek(). */ 66 + 67 + return 0; 68 + }
+8 -4
tools/testing/selftests/proc/proc-pid-vm.c
··· 47 47 #include <sys/resource.h> 48 48 #include <linux/fs.h> 49 49 50 + #ifndef __maybe_unused 51 + #define __maybe_unused __attribute__((__unused__)) 52 + #endif 53 + 50 54 #include "../kselftest.h" 51 55 52 56 static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) ··· 222 218 * 2: vsyscall VMA is r-xp vsyscall=emulate 223 219 */ 224 220 static volatile int g_vsyscall; 225 - static const char *str_vsyscall; 221 + static const char *str_vsyscall __maybe_unused; 226 222 227 - static const char str_vsyscall_0[] = ""; 228 - static const char str_vsyscall_1[] = 223 + static const char str_vsyscall_0[] __maybe_unused = ""; 224 + static const char str_vsyscall_1[] __maybe_unused = 229 225 "ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall]\n"; 230 - static const char str_vsyscall_2[] = 226 + static const char str_vsyscall_2[] __maybe_unused = 231 227 "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n"; 232 228 233 229 #ifdef __x86_64__