Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: insert scheduler ticks when userspace does not yield

In time-travel mode userspace can do a lot of work without any time
passing. Unfortunately, this can result in OOM situations as the RCU
core code will never be run.

Work around this by keeping track of userspace processes that perform a
large number of operations without yielding. When this happens, insert a
jiffy into sched_clock to account time against the process and cause the
bookkeeping to run.

As sched_clock is used for tracing, it is useful to keep it in sync
between the different VMs. Therefore, the added ticks are removed again,
one at a time, whenever the actual clock ticks.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20241010142537.1134685-1-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

Authored by Benjamin Berg and committed by Johannes Berg
0b8b2668 9b088185

+66 -1
+15
arch/um/Kconfig
··· 227 227 228 228 It is safe to say Y, but you probably don't need this. 229 229 230 + config UML_MAX_USERSPACE_ITERATIONS 231 + int 232 + prompt "Maximum number of unscheduled userspace iterations" 233 + default 10000 234 + depends on UML_TIME_TRAVEL_SUPPORT 235 + help 236 + In UML inf-cpu and ext time-travel mode userspace can run without being 237 + interrupted. This will eventually overwhelm the kernel and create OOM 238 + situations (mainly RCU not running). This setting specifies the number 239 + of kernel/userspace switches (minor/major page fault, signal or syscall) 240 + for the same userspace thread before the sched_clock is advanced by a 241 + jiffie to trigger scheduling. 242 + 243 + Setting it to zero disables the feature. 244 + 230 245 config KASAN_SHADOW_OFFSET 231 246 hex 232 247 depends on KASAN
+5 -1
arch/um/include/shared/common-offsets.h
··· 28 28 #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT 29 29 DEFINE(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT, CONFIG_UML_TIME_TRAVEL_SUPPORT); 30 30 #endif 31 - 31 + #ifdef CONFIG_UML_MAX_USERSPACE_ITERATIONS 32 + DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, CONFIG_UML_MAX_USERSPACE_ITERATIONS); 33 + #else 34 + DEFINE(UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS, 0); 35 + #endif
+20
arch/um/kernel/time.c
··· 25 25 #include <shared/init.h> 26 26 27 27 #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT 28 + #include <linux/sched/clock.h> 29 + 28 30 enum time_travel_mode time_travel_mode; 29 31 EXPORT_SYMBOL_GPL(time_travel_mode); 30 32 ··· 48 46 static u16 time_travel_shm_id; 49 47 static struct um_timetravel_schedshm *time_travel_shm; 50 48 static union um_timetravel_schedshm_client *time_travel_shm_client; 49 + 50 + unsigned long tt_extra_sched_jiffies; 51 + 52 + notrace unsigned long long sched_clock(void) 53 + { 54 + return (unsigned long long)(jiffies - INITIAL_JIFFIES + 55 + tt_extra_sched_jiffies) 56 + * (NSEC_PER_SEC / HZ); 57 + } 51 58 52 59 static void time_travel_set_time(unsigned long long ns) 53 60 { ··· 454 443 { 455 444 time_travel_add_event(&time_travel_timer_event, 456 445 time_travel_time + time_travel_timer_interval); 446 + 447 + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ 448 + if (tt_extra_sched_jiffies > 0) 449 + tt_extra_sched_jiffies -= 1; 450 + 457 451 deliver_alarm(); 458 452 } 459 453 ··· 610 594 611 595 static void time_travel_oneshot_timer(struct time_travel_event *e) 612 596 { 597 + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ 598 + if (tt_extra_sched_jiffies > 0) 599 + tt_extra_sched_jiffies -= 1; 600 + 613 601 deliver_alarm(); 614 602 } 615 603
+26
arch/um/os-Linux/skas/process.c
··· 388 388 return err; 389 389 } 390 390 391 + int unscheduled_userspace_iterations; 392 + extern unsigned long tt_extra_sched_jiffies; 393 + 391 394 void userspace(struct uml_pt_regs *regs) 392 395 { 393 396 int err, status, op, pid = userspace_pid[0]; ··· 400 397 interrupt_end(); 401 398 402 399 while (1) { 400 + /* 401 + * When we are in time-travel mode, userspace can theoretically 402 + * do a *lot* of work without being scheduled. The problem with 403 + * this is that it will prevent kernel bookkeeping (primarily 404 + * the RCU) from running and this can for example cause OOM 405 + * situations. 406 + * 407 + * This code accounts a jiffie against the scheduling clock 408 + * after the defined userspace iterations in the same thread. 409 + * By doing so the situation is effectively prevented. 410 + */ 411 + if (time_travel_mode == TT_MODE_INFCPU || 412 + time_travel_mode == TT_MODE_EXTERNAL) { 413 + if (UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS && 414 + unscheduled_userspace_iterations++ > 415 + UML_CONFIG_UML_MAX_USERSPACE_ITERATIONS) { 416 + tt_extra_sched_jiffies += 1; 417 + unscheduled_userspace_iterations = 0; 418 + } 419 + } 420 + 403 421 time_travel_print_bc_msg(); 404 422 405 423 current_mm_sync(); ··· 563 539 564 540 void switch_threads(jmp_buf *me, jmp_buf *you) 565 541 { 542 + unscheduled_userspace_iterations = 0; 543 + 566 544 if (UML_SETJMP(me) == 0) 567 545 UML_LONGJMP(you, 1); 568 546 }