Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: Add initial SMP support

Add initial symmetric multi-processing (SMP) support to UML. With
this support enabled, users can tell UML to start multiple virtual
processors, each represented as a separate host thread.

In UML, kthreads and normal threads (when running in kernel mode)
can be scheduled and executed simultaneously on different virtual
processors. However, the userspace code of normal threads still
runs within their respective single-threaded stubs.

That is, SMP support is currently available both within the kernel
and across different processes, but userspace execution remains
limited to a single thread within each process.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20251027001815.1666872-6-tiwei.bie@linux.dev
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

authored by

Tiwei Bie and committed by
Johannes Berg
1e4ee513 9c82de55

+766 -48
+1 -1
Documentation/features/core/generic-idle-thread/arch-support.txt
··· 24 24 | s390: | ok | 25 25 | sh: | ok | 26 26 | sparc: | ok | 27 - | um: | TODO | 27 + | um: | ok | 28 28 | x86: | ok | 29 29 | xtensa: | ok | 30 30 -----------------------
+43 -3
arch/um/Kconfig
··· 28 28 select OF_EARLY_FLATTREE if OF 29 29 select GENERIC_IRQ_SHOW 30 30 select GENERIC_CPU_DEVICES 31 + select GENERIC_SMP_IDLE_THREAD 31 32 select HAVE_GCC_PLUGINS 32 33 select ARCH_SUPPORTS_LTO_CLANG 33 34 select ARCH_SUPPORTS_LTO_CLANG_THIN ··· 82 81 int 83 82 default 100 84 83 85 - config NR_CPUS 84 + config UML_SUBARCH_SUPPORTS_SMP 85 + bool 86 + 87 + config SMP 88 + bool "Symmetric multi-processing support" 89 + default n 90 + depends on UML_SUBARCH_SUPPORTS_SMP 91 + help 92 + This option enables UML SMP support. 93 + 94 + With this enabled, users can tell UML to start multiple virtual 95 + processors. Each virtual processor is represented as a separate 96 + host thread. 97 + 98 + In UML, kthreads and normal threads (when running in kernel mode) 99 + can be scheduled and executed simultaneously on different virtual 100 + processors. However, the userspace code of normal threads still 101 + runs within their respective single-threaded stubs. 102 + 103 + That is, SMP support is available both within the kernel and 104 + across different processes, but remains limited within threads 105 + of the same process in userspace. 106 + 107 + config NR_CPUS_RANGE_BEGIN 86 108 int 87 - range 1 1 88 - default 1 109 + default 1 if !SMP 110 + default 2 111 + 112 + config NR_CPUS_RANGE_END 113 + int 114 + default 1 if !SMP 115 + default 64 116 + 117 + config NR_CPUS_DEFAULT 118 + int 119 + default 1 if !SMP 120 + default 2 121 + 122 + config NR_CPUS 123 + int "Maximum number of CPUs" if SMP 124 + range NR_CPUS_RANGE_BEGIN NR_CPUS_RANGE_END 125 + default NR_CPUS_DEFAULT 89 126 90 127 source "arch/$(HEADER_ARCH)/um/Kconfig" 91 128 ··· 293 254 294 255 config ARCH_SUSPEND_POSSIBLE 295 256 def_bool y 257 + depends on !SMP 296 258 297 259 menu "Power management options" 298 260
+3 -2
arch/um/include/asm/current.h
··· 7 7 8 8 #ifndef __ASSEMBLER__ 9 9 10 + #include <shared/smp.h> 11 + 10 12 struct task_struct; 11 13 extern struct task_struct *cpu_tasks[NR_CPUS]; 12 14 13 15 static __always_inline struct task_struct *get_current(void) 14 16 { 15 - return cpu_tasks[0]; 17 + return cpu_tasks[uml_curr_cpu()]; 16 18 } 17 - 18 19 19 20 #define current get_current() 20 21
+23 -1
arch/um/include/asm/hardirq.h
··· 2 2 #ifndef __ASM_UM_HARDIRQ_H 3 3 #define __ASM_UM_HARDIRQ_H 4 4 5 - #include <asm-generic/hardirq.h> 5 + #include <linux/cache.h> 6 + #include <linux/threads.h> 6 7 7 8 #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 9 + 10 + typedef struct { 11 + unsigned int __softirq_pending; 12 + #if IS_ENABLED(CONFIG_SMP) 13 + unsigned int irq_resched_count; 14 + unsigned int irq_call_count; 15 + #endif 16 + } ____cacheline_aligned irq_cpustat_t; 17 + 18 + DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); 19 + 20 + #define __ARCH_IRQ_STAT 21 + 22 + #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) 23 + 24 + #include <linux/irq.h> 25 + 26 + static inline void ack_bad_irq(unsigned int irq) 27 + { 28 + pr_crit("unexpected IRQ trap at vector %02x\n", irq); 29 + } 8 30 9 31 #endif /* __ASM_UM_HARDIRQ_H */
+10
arch/um/include/asm/mmu.h
··· 7 7 #define __ARCH_UM_MMU_H 8 8 9 9 #include "linux/types.h" 10 + #include <linux/mutex.h> 11 + #include <linux/spinlock.h> 10 12 #include <mm_id.h> 11 13 12 14 typedef struct mm_context { 13 15 struct mm_id id; 16 + struct mutex turnstile; 14 17 15 18 struct list_head list; 16 19 17 20 /* Address range in need of a TLB sync */ 21 + spinlock_t sync_tlb_lock; 18 22 unsigned long sync_tlb_range_from; 19 23 unsigned long sync_tlb_range_to; 20 24 } mm_context_t; 25 + 26 + #define INIT_MM_CONTEXT(mm) \ 27 + .context = { \ 28 + .turnstile = __MUTEX_INITIALIZER(mm.context.turnstile), \ 29 + .sync_tlb_lock = __SPIN_LOCK_INITIALIZER(mm.context.sync_tlb_lock), \ 30 + } 21 31 22 32 #endif
+2
arch/um/include/asm/pgtable.h
··· 225 225 static inline void um_tlb_mark_sync(struct mm_struct *mm, unsigned long start, 226 226 unsigned long end) 227 227 { 228 + guard(spinlock_irqsave)(&mm->context.sync_tlb_lock); 229 + 228 230 if (!mm->context.sync_tlb_range_to) { 229 231 mm->context.sync_tlb_range_from = start; 230 232 mm->context.sync_tlb_range_to = end;
+14 -1
arch/um/include/asm/smp.h
··· 2 2 #ifndef __UM_SMP_H 3 3 #define __UM_SMP_H 4 4 5 - #define hard_smp_processor_id() 0 5 + #if IS_ENABLED(CONFIG_SMP) 6 + 7 + #include <linux/cpumask.h> 8 + #include <shared/smp.h> 9 + 10 + #define raw_smp_processor_id() uml_curr_cpu() 11 + 12 + void arch_smp_send_reschedule(int cpu); 13 + 14 + void arch_send_call_function_single_ipi(int cpu); 15 + 16 + void arch_send_call_function_ipi_mask(const struct cpumask *mask); 17 + 18 + #endif /* CONFIG_SMP */ 6 19 7 20 #endif
+17
arch/um/include/linux/smp-internal.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __UM_SMP_INTERNAL_H 3 + #define __UM_SMP_INTERNAL_H 4 + 5 + #if IS_ENABLED(CONFIG_SMP) 6 + 7 + void prefill_possible_map(void); 8 + 9 + #else /* !CONFIG_SMP */ 10 + 11 + static inline void prefill_possible_map(void) { } 12 + 13 + #endif /* CONFIG_SMP */ 14 + 15 + extern char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); 16 + 17 + #endif /* __UM_SMP_INTERNAL_H */
+17
arch/um/include/shared/os.h
··· 216 216 217 217 void os_set_pdeathsig(void); 218 218 219 + int os_futex_wait(void *uaddr, unsigned int val); 220 + int os_futex_wake(void *uaddr); 221 + 219 222 /* execvp.c */ 220 223 extern int execvp_noalloc(char *buf, const char *file, char *const argv[]); 221 224 /* helper.c */ ··· 270 267 __attribute__ ((format (printf, 1, 2))); 271 268 272 269 /* time.c */ 270 + void os_idle_prepare(void); 273 271 extern void os_idle_sleep(void); 274 272 extern int os_timer_create(void); 275 273 extern int os_timer_set_interval(int cpu, unsigned long long nsecs); ··· 342 338 343 339 /* time-travel */ 344 340 extern void deliver_time_travel_irqs(void); 341 + 342 + /* smp.c */ 343 + #if IS_ENABLED(CONFIG_SMP) 344 + void os_init_smp(void); 345 + int os_start_cpu_thread(int cpu); 346 + void os_start_secondary(void *arg, jmp_buf *switch_buf); 347 + int os_send_ipi(int cpu, int vector); 348 + void os_local_ipi_enable(void); 349 + void os_local_ipi_disable(void); 350 + #else /* !CONFIG_SMP */ 351 + static inline void os_local_ipi_enable(void) { } 352 + static inline void os_local_ipi_disable(void) { } 353 + #endif /* CONFIG_SMP */ 345 354 346 355 #endif
+5
arch/um/include/shared/skas/mm_id.h
··· 6 6 #ifndef __MM_ID_H 7 7 #define __MM_ID_H 8 8 9 + #include <linux/compiler_types.h> 10 + 9 11 #define STUB_MAX_FDS 4 10 12 11 13 struct mm_id { ··· 20 18 int syscall_fd_num; 21 19 int syscall_fd_map[STUB_MAX_FDS]; 22 20 }; 21 + 22 + void enter_turnstile(struct mm_id *mm_id) __acquires(turnstile); 23 + void exit_turnstile(struct mm_id *mm_id) __releases(turnstile); 23 24 24 25 void notify_mm_kill(int pid); 25 26
+2
arch/um/include/shared/skas/skas.h
··· 15 15 extern unsigned long current_stub_stack(void); 16 16 extern struct mm_id *current_mm_id(void); 17 17 extern void current_mm_sync(void); 18 + void initial_jmpbuf_lock(void); 19 + void initial_jmpbuf_unlock(void); 18 20 19 21 #endif
+20
arch/um/include/shared/smp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __UM_SHARED_SMP_H 3 + #define __UM_SHARED_SMP_H 4 + 5 + #if IS_ENABLED(CONFIG_SMP) 6 + 7 + extern int uml_ncpus; 8 + 9 + int uml_curr_cpu(void); 10 + void uml_start_secondary(void *opaque); 11 + void uml_ipi_handler(int vector); 12 + 13 + #else /* !CONFIG_SMP */ 14 + 15 + #define uml_ncpus 1 16 + #define uml_curr_cpu() 0 17 + 18 + #endif /* CONFIG_SMP */ 19 + 20 + #endif /* __UM_SHARED_SMP_H */
+1
arch/um/kernel/Makefile
··· 25 25 obj-$(CONFIG_OF) += dtb.o 26 26 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 27 27 obj-$(CONFIG_STACKTRACE) += stacktrace.o 28 + obj-$(CONFIG_SMP) += smp.o 28 29 29 30 USER_OBJS := config.o 30 31
+25
arch/um/kernel/irq.c
··· 22 22 #include <irq_kern.h> 23 23 #include <linux/time-internal.h> 24 24 25 + DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); 26 + 27 + #define irq_stats(x) (&per_cpu(irq_stat, x)) 25 28 26 29 /* When epoll triggers we do not know why it did so 27 30 * we can also have different IRQs for read and write. ··· 703 700 struct uml_pt_regs *regs, void *mc) 704 701 { 705 702 do_IRQ(SIGCHLD_IRQ, regs); 703 + } 704 + 705 + /* 706 + * /proc/interrupts printing for arch specific interrupts 707 + */ 708 + int arch_show_interrupts(struct seq_file *p, int prec) 709 + { 710 + #if IS_ENABLED(CONFIG_SMP) 711 + int cpu; 712 + 713 + seq_printf(p, "%*s: ", prec, "RES"); 714 + for_each_online_cpu(cpu) 715 + seq_printf(p, "%10u ", irq_stats(cpu)->irq_resched_count); 716 + seq_puts(p, " Rescheduling interrupts\n"); 717 + 718 + seq_printf(p, "%*s: ", prec, "CAL"); 719 + for_each_online_cpu(cpu) 720 + seq_printf(p, "%10u ", irq_stats(cpu)->irq_call_count); 721 + seq_puts(p, " Function call interrupts\n"); 722 + #endif 723 + 724 + return 0; 706 725 }
+5
arch/um/kernel/process.c
··· 218 218 um_idle_sleep(); 219 219 } 220 220 221 + void arch_cpu_idle_prepare(void) 222 + { 223 + os_idle_prepare(); 224 + } 225 + 221 226 int __uml_cant_sleep(void) { 222 227 return in_atomic() || irqs_disabled() || in_interrupt(); 223 228 /* Is in_interrupt() really needed? */
+25 -8
arch/um/kernel/skas/mmu.c
··· 23 23 static spinlock_t mm_list_lock; 24 24 static struct list_head mm_list; 25 25 26 + void enter_turnstile(struct mm_id *mm_id) __acquires(turnstile) 27 + { 28 + struct mm_context *ctx = container_of(mm_id, struct mm_context, id); 29 + 30 + mutex_lock(&ctx->turnstile); 31 + } 32 + 33 + void exit_turnstile(struct mm_id *mm_id) __releases(turnstile) 34 + { 35 + struct mm_context *ctx = container_of(mm_id, struct mm_context, id); 36 + 37 + mutex_unlock(&ctx->turnstile); 38 + } 39 + 26 40 int init_new_context(struct task_struct *task, struct mm_struct *mm) 27 41 { 28 42 struct mm_id *new_id = &mm->context.id; 29 43 unsigned long stack = 0; 30 44 int ret = -ENOMEM; 31 45 46 + mutex_init(&mm->context.turnstile); 47 + spin_lock_init(&mm->context.sync_tlb_lock); 48 + 32 49 stack = __get_free_pages(GFP_KERNEL | __GFP_ZERO, ilog2(STUB_DATA_PAGES)); 33 50 if (stack == 0) 34 51 goto out; 35 52 36 53 new_id->stack = stack; 54 + new_id->syscall_data_len = 0; 55 + new_id->syscall_fd_num = 0; 37 56 38 57 scoped_guard(spinlock_irqsave, &mm_list_lock) { 39 58 /* Insert into list, used for lookups when the child dies */ ··· 92 73 return; 93 74 } 94 75 76 + scoped_guard(spinlock_irqsave, &mm_list_lock) 77 + list_del(&mm->context.list); 78 + 95 79 if (mmu->id.pid > 0) { 96 80 os_kill_ptraced_process(mmu->id.pid, 1); 97 81 mmu->id.pid = -1; ··· 104 82 os_close_file(mmu->id.sock); 105 83 106 84 free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); 107 - 108 - guard(spinlock_irqsave)(&mm_list_lock); 109 - 110 - list_del(&mm->context.list); 111 85 } 112 86 113 87 static irqreturn_t mm_sigchld_irq(int irq, void* dev) ··· 128 110 /* Marks the MM as dead */ 129 111 mm_context->id.pid = -1; 130 112 131 - /* 132 - * NOTE: If SMP is implemented, a futex_wake 133 - * needs to be added here. 
134 - */ 135 113 stub_data = (void *)mm_context->id.stack; 136 114 stub_data->futex = FUTEX_IN_KERN; 115 + #if IS_ENABLED(CONFIG_SMP) 116 + os_futex_wake(&stub_data->futex); 117 + #endif 137 118 138 119 /* 139 120 * NOTE: Currently executing syscalls by
+16 -3
arch/um/kernel/skas/process.c
··· 7 7 #include <linux/sched/mm.h> 8 8 #include <linux/sched/task_stack.h> 9 9 #include <linux/sched/task.h> 10 + #include <linux/smp-internal.h> 10 11 11 12 #include <asm/tlbflush.h> 12 13 ··· 27 26 return 0; 28 27 } 29 28 30 - static char cpu0_irqstack[THREAD_SIZE] __aligned(THREAD_SIZE); 29 + char cpu_irqstacks[NR_CPUS][THREAD_SIZE] __aligned(THREAD_SIZE); 31 30 32 31 int __init start_uml(void) 33 32 { 34 - stack_protections((unsigned long) &cpu0_irqstack); 35 - set_sigstack(cpu0_irqstack, THREAD_SIZE); 33 + stack_protections((unsigned long) &cpu_irqstacks[0]); 34 + set_sigstack(cpu_irqstacks[0], THREAD_SIZE); 36 35 37 36 init_new_thread_signals(); 38 37 ··· 64 63 return; 65 64 66 65 um_tlb_sync(current->mm); 66 + } 67 + 68 + static DEFINE_SPINLOCK(initial_jmpbuf_spinlock); 69 + 70 + void initial_jmpbuf_lock(void) 71 + { 72 + spin_lock_irq(&initial_jmpbuf_spinlock); 73 + } 74 + 75 + void initial_jmpbuf_unlock(void) 76 + { 77 + spin_unlock_irq(&initial_jmpbuf_spinlock); 67 78 }
+242
arch/um/kernel/smp.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2025 Ant Group 4 + * Author: Tiwei Bie <tiwei.btw@antgroup.com> 5 + * 6 + * Based on the previous implementation in TT mode 7 + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) 8 + */ 9 + 10 + #include <linux/sched.h> 11 + #include <linux/sched/task.h> 12 + #include <linux/sched/task_stack.h> 13 + #include <linux/module.h> 14 + #include <linux/processor.h> 15 + #include <linux/threads.h> 16 + #include <linux/cpu.h> 17 + #include <linux/hardirq.h> 18 + #include <linux/smp.h> 19 + #include <linux/smp-internal.h> 20 + #include <init.h> 21 + #include <kern.h> 22 + #include <os.h> 23 + #include <smp.h> 24 + 25 + enum { 26 + UML_IPI_RES = 0, 27 + UML_IPI_CALL_SINGLE, 28 + UML_IPI_CALL, 29 + UML_IPI_STOP, 30 + }; 31 + 32 + void arch_smp_send_reschedule(int cpu) 33 + { 34 + os_send_ipi(cpu, UML_IPI_RES); 35 + } 36 + 37 + void arch_send_call_function_single_ipi(int cpu) 38 + { 39 + os_send_ipi(cpu, UML_IPI_CALL_SINGLE); 40 + } 41 + 42 + void arch_send_call_function_ipi_mask(const struct cpumask *mask) 43 + { 44 + int cpu; 45 + 46 + for_each_cpu(cpu, mask) 47 + os_send_ipi(cpu, UML_IPI_CALL); 48 + } 49 + 50 + void smp_send_stop(void) 51 + { 52 + int cpu, me = smp_processor_id(); 53 + 54 + for_each_online_cpu(cpu) { 55 + if (cpu == me) 56 + continue; 57 + os_send_ipi(cpu, UML_IPI_STOP); 58 + } 59 + } 60 + 61 + static void ipi_handler(int vector, struct uml_pt_regs *regs) 62 + { 63 + struct pt_regs *old_regs = set_irq_regs((struct pt_regs *)regs); 64 + int cpu = raw_smp_processor_id(); 65 + 66 + irq_enter(); 67 + 68 + if (current->mm) 69 + os_alarm_process(current->mm->context.id.pid); 70 + 71 + switch (vector) { 72 + case UML_IPI_RES: 73 + inc_irq_stat(irq_resched_count); 74 + scheduler_ipi(); 75 + break; 76 + 77 + case UML_IPI_CALL_SINGLE: 78 + inc_irq_stat(irq_call_count); 79 + generic_smp_call_function_single_interrupt(); 80 + break; 81 + 82 + case UML_IPI_CALL: 83 + 
inc_irq_stat(irq_call_count); 84 + generic_smp_call_function_interrupt(); 85 + break; 86 + 87 + case UML_IPI_STOP: 88 + set_cpu_online(cpu, false); 89 + while (1) 90 + pause(); 91 + break; 92 + 93 + default: 94 + pr_err("CPU#%d received unknown IPI (vector=%d)!\n", cpu, vector); 95 + break; 96 + } 97 + 98 + irq_exit(); 99 + set_irq_regs(old_regs); 100 + } 101 + 102 + void uml_ipi_handler(int vector) 103 + { 104 + struct uml_pt_regs r = { .is_user = 0 }; 105 + 106 + preempt_disable(); 107 + ipi_handler(vector, &r); 108 + preempt_enable(); 109 + } 110 + 111 + /* AP states used only during CPU startup */ 112 + enum { 113 + UML_CPU_PAUSED = 0, 114 + UML_CPU_RUNNING, 115 + }; 116 + 117 + static int cpu_states[NR_CPUS]; 118 + 119 + static int start_secondary(void *unused) 120 + { 121 + int err, cpu = raw_smp_processor_id(); 122 + 123 + notify_cpu_starting(cpu); 124 + set_cpu_online(cpu, true); 125 + 126 + err = um_setup_timer(); 127 + if (err) 128 + panic("CPU#%d failed to setup timer, err = %d", cpu, err); 129 + 130 + local_irq_enable(); 131 + 132 + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 133 + 134 + return 0; 135 + } 136 + 137 + void uml_start_secondary(void *opaque) 138 + { 139 + int cpu = raw_smp_processor_id(); 140 + struct mm_struct *mm = &init_mm; 141 + struct task_struct *idle; 142 + 143 + stack_protections((unsigned long) &cpu_irqstacks[cpu]); 144 + set_sigstack(&cpu_irqstacks[cpu], THREAD_SIZE); 145 + 146 + set_cpu_present(cpu, true); 147 + os_futex_wait(&cpu_states[cpu], UML_CPU_PAUSED); 148 + 149 + smp_rmb(); /* paired with smp_wmb() in __cpu_up() */ 150 + 151 + idle = cpu_tasks[cpu]; 152 + idle->thread_info.cpu = cpu; 153 + 154 + mmgrab(mm); 155 + idle->active_mm = mm; 156 + 157 + idle->thread.request.thread.proc = start_secondary; 158 + idle->thread.request.thread.arg = NULL; 159 + 160 + new_thread(task_stack_page(idle), &idle->thread.switch_buf, 161 + new_thread_handler); 162 + os_start_secondary(opaque, &idle->thread.switch_buf); 163 + } 164 + 165 + void 
__init smp_prepare_cpus(unsigned int max_cpus) 166 + { 167 + int err, cpu, me = smp_processor_id(); 168 + unsigned long deadline; 169 + 170 + os_init_smp(); 171 + 172 + for_each_possible_cpu(cpu) { 173 + if (cpu == me) 174 + continue; 175 + 176 + pr_debug("Booting processor %d...\n", cpu); 177 + err = os_start_cpu_thread(cpu); 178 + if (err) { 179 + pr_crit("CPU#%d failed to start cpu thread, err = %d", 180 + cpu, err); 181 + continue; 182 + } 183 + 184 + deadline = jiffies + msecs_to_jiffies(1000); 185 + spin_until_cond(cpu_present(cpu) || 186 + time_is_before_jiffies(deadline)); 187 + 188 + if (!cpu_present(cpu)) 189 + pr_crit("CPU#%d failed to boot\n", cpu); 190 + } 191 + } 192 + 193 + int __cpu_up(unsigned int cpu, struct task_struct *tidle) 194 + { 195 + cpu_tasks[cpu] = tidle; 196 + smp_wmb(); /* paired with smp_rmb() in uml_start_secondary() */ 197 + cpu_states[cpu] = UML_CPU_RUNNING; 198 + os_futex_wake(&cpu_states[cpu]); 199 + spin_until_cond(cpu_online(cpu)); 200 + 201 + return 0; 202 + } 203 + 204 + void __init smp_cpus_done(unsigned int max_cpus) 205 + { 206 + } 207 + 208 + /* Set in uml_ncpus_setup */ 209 + int uml_ncpus = 1; 210 + 211 + void __init prefill_possible_map(void) 212 + { 213 + int cpu; 214 + 215 + for (cpu = 0; cpu < uml_ncpus; cpu++) 216 + set_cpu_possible(cpu, true); 217 + for (; cpu < NR_CPUS; cpu++) 218 + set_cpu_possible(cpu, false); 219 + } 220 + 221 + static int __init uml_ncpus_setup(char *line, int *add) 222 + { 223 + *add = 0; 224 + 225 + if (kstrtoint(line, 10, &uml_ncpus)) { 226 + os_warn("%s: Couldn't parse '%s'\n", __func__, line); 227 + return -1; 228 + } 229 + 230 + uml_ncpus = clamp(uml_ncpus, 1, NR_CPUS); 231 + 232 + return 0; 233 + } 234 + 235 + __uml_setup("ncpus=", uml_ncpus_setup, 236 + "ncpus=<# of desired CPUs>\n" 237 + " This tells UML how many virtual processors to start. 
The maximum\n" 238 + " number of supported virtual processors can be obtained by querying\n" 239 + " the CONFIG_NR_CPUS option using --showconfig.\n\n" 240 + ); 241 + 242 + EXPORT_SYMBOL(uml_curr_cpu);
+4 -1
arch/um/kernel/tlb.c
··· 162 162 { 163 163 pgd_t *pgd; 164 164 struct vm_ops ops; 165 - unsigned long addr = mm->context.sync_tlb_range_from, next; 165 + unsigned long addr, next; 166 166 int ret = 0; 167 + 168 + guard(spinlock_irqsave)(&mm->context.sync_tlb_lock); 167 169 168 170 if (mm->context.sync_tlb_range_to == 0) 169 171 return 0; ··· 179 177 ops.unmap = unmap; 180 178 } 181 179 180 + addr = mm->context.sync_tlb_range_from; 182 181 pgd = pgd_offset(mm, addr); 183 182 do { 184 183 next = pgd_addr_end(addr, mm->context.sync_tlb_range_to);
+1 -1
arch/um/kernel/trap.c
··· 316 316 if (!is_user && regs) 317 317 current->thread.segv_regs = container_of(regs, struct pt_regs, regs); 318 318 319 - if (!is_user && init_mm.context.sync_tlb_range_to) { 319 + if (!is_user && address >= start_vm && address < end_vm) { 320 320 /* 321 321 * Kernel has pending updates from set_ptes that were not 322 322 * flushed yet. Syncing them should fix the pagefault (if not
+23 -2
arch/um/kernel/um_arch.c
··· 19 19 #include <linux/kmsg_dump.h> 20 20 #include <linux/suspend.h> 21 21 #include <linux/random.h> 22 + #include <linux/smp-internal.h> 22 23 23 24 #include <asm/processor.h> 24 25 #include <asm/cpufeature.h> ··· 72 71 { 73 72 int i = 0; 74 73 74 + #if IS_ENABLED(CONFIG_SMP) 75 + i = (uintptr_t) v - 1; 76 + if (!cpu_online(i)) 77 + return 0; 78 + #endif 79 + 75 80 seq_printf(m, "processor\t: %d\n", i); 76 81 seq_printf(m, "vendor_id\t: User Mode Linux\n"); 77 82 seq_printf(m, "model name\t: UML\n"); ··· 94 87 loops_per_jiffy/(500000/HZ), 95 88 (loops_per_jiffy/(5000/HZ)) % 100); 96 89 97 - 98 90 return 0; 99 91 } 100 92 101 93 static void *c_start(struct seq_file *m, loff_t *pos) 102 94 { 103 - return *pos < nr_cpu_ids ? &boot_cpu_data + *pos : NULL; 95 + if (*pos < nr_cpu_ids) 96 + return (void *)(uintptr_t)(*pos + 1); 97 + return NULL; 104 98 } 105 99 106 100 static void *c_next(struct seq_file *m, void *v, loff_t *pos) ··· 417 409 strscpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 418 410 *cmdline_p = command_line; 419 411 setup_hostinfo(host_info, sizeof host_info); 412 + prefill_possible_map(); 420 413 421 414 if (os_getrandom(rng_seed, sizeof(rng_seed), 0) == sizeof(rng_seed)) { 422 415 add_bootloader_randomness(rng_seed, sizeof(rng_seed)); ··· 451 442 void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 452 443 { 453 444 } 445 + 446 + #if IS_ENABLED(CONFIG_SMP) 447 + void alternatives_smp_module_add(struct module *mod, char *name, 448 + void *locks, void *locks_end, 449 + void *text, void *text_end) 450 + { 451 + } 452 + 453 + void alternatives_smp_module_del(struct module *mod) 454 + { 455 + } 456 + #endif 454 457 455 458 void *text_poke(void *addr, const void *opcode, size_t len) 456 459 {
+3 -1
arch/um/os-Linux/Makefile
··· 16 16 17 17 obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o 18 18 19 + obj-$(CONFIG_SMP) += smp.o 20 + 19 21 USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \ 20 22 main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \ 21 - tty.o umid.o util.o 23 + tty.o umid.o util.o smp.o 22 24 23 25 include $(srctree)/arch/um/scripts/Makefile.rules
+8
arch/um/os-Linux/internal.h
··· 4 4 5 5 #include <mm_id.h> 6 6 #include <stub-data.h> 7 + #include <signal.h> 7 8 8 9 /* 9 10 * elf_aux.c ··· 19 18 /* 20 19 * signal.c 21 20 */ 21 + extern __thread int signals_enabled; 22 22 int timer_alarm_pending(void); 23 23 24 24 /* ··· 27 25 */ 28 26 void wait_stub_done(int pid); 29 27 void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys); 28 + 29 + /* 30 + * smp.c 31 + */ 32 + #define IPI_SIGNAL SIGRTMIN 33 + 30 34 #endif /* __UM_OS_LINUX_INTERNAL_H */
+20
arch/um/os-Linux/process.c
··· 10 10 #include <errno.h> 11 11 #include <signal.h> 12 12 #include <fcntl.h> 13 + #include <limits.h> 14 + #include <linux/futex.h> 13 15 #include <sys/mman.h> 14 16 #include <sys/ptrace.h> 15 17 #include <sys/prctl.h> ··· 190 188 void os_set_pdeathsig(void) 191 189 { 192 190 prctl(PR_SET_PDEATHSIG, SIGKILL); 191 + } 192 + 193 + int os_futex_wait(void *uaddr, unsigned int val) 194 + { 195 + int r; 196 + 197 + CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAIT, val, 198 + NULL, NULL, 0)); 199 + return r < 0 ? -errno : r; 200 + } 201 + 202 + int os_futex_wake(void *uaddr) 203 + { 204 + int r; 205 + 206 + CATCH_EINTR(r = syscall(__NR_futex, uaddr, FUTEX_WAKE, INT_MAX, 207 + NULL, NULL, 0)); 208 + return r < 0 ? -errno : r; 193 209 }
+26 -5
arch/um/os-Linux/signal.c
··· 69 69 #define SIGCHLD_BIT 2 70 70 #define SIGCHLD_MASK (1 << SIGCHLD_BIT) 71 71 72 - static __thread int signals_enabled; 72 + __thread int signals_enabled; 73 73 #if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) 74 74 static int signals_blocked, signals_blocked_pending; 75 75 #endif ··· 259 259 return 0; 260 260 } 261 261 262 + static inline void __block_signals(void) 263 + { 264 + if (!signals_enabled) 265 + return; 266 + 267 + os_local_ipi_disable(); 268 + barrier(); 269 + signals_enabled = 0; 270 + } 271 + 272 + static inline void __unblock_signals(void) 273 + { 274 + if (signals_enabled) 275 + return; 276 + 277 + signals_enabled = 1; 278 + barrier(); 279 + os_local_ipi_enable(); 280 + } 281 + 262 282 void block_signals(void) 263 283 { 264 - signals_enabled = 0; 284 + __block_signals(); 265 285 /* 266 286 * This must return with signals disabled, so this barrier 267 287 * ensures that writes are flushed out before the return. ··· 298 278 if (signals_enabled == 1) 299 279 return; 300 280 301 - signals_enabled = 1; 281 + __unblock_signals(); 282 + 302 283 #if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) 303 284 deliver_time_travel_irqs(); 304 285 #endif ··· 333 312 * tracing that happens inside the handlers we call for the 334 313 * pending signals will mess up the tracing state. 335 314 */ 336 - signals_enabled = 0; 315 + __block_signals(); 337 316 um_trace_signals_off(); 338 317 339 318 /* ··· 365 344 366 345 /* Re-enable signals and trace that we're doing so. */ 367 346 um_trace_signals_on(); 368 - signals_enabled = 1; 347 + __unblock_signals(); 369 348 } 370 349 } 371 350
+25 -14
arch/um/os-Linux/skas/process.c
··· 546 546 void userspace(struct uml_pt_regs *regs) 547 547 { 548 548 int err, status, op; 549 - siginfo_t si_ptrace; 549 + siginfo_t si_local; 550 550 siginfo_t *si; 551 551 int sig; 552 552 ··· 555 555 556 556 while (1) { 557 557 struct mm_id *mm_id = current_mm_id(); 558 + 559 + /* 560 + * At any given time, only one CPU thread can enter the 561 + * turnstile to operate on the same stub process, including 562 + * executing stub system calls (mmap and munmap). 563 + */ 564 + enter_turnstile(mm_id); 558 565 559 566 /* 560 567 * When we are in time-travel mode, userspace can theoretically ··· 630 623 } 631 624 632 625 if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) 633 - panic("%s - Invalid siginfo offset from child", 634 - __func__); 635 - si = (void *)&proc_data->sigstack[proc_data->si_offset]; 626 + panic("%s - Invalid siginfo offset from child", __func__); 627 + 628 + si = &si_local; 629 + memcpy(si, &proc_data->sigstack[proc_data->si_offset], sizeof(*si)); 636 630 637 631 regs->is_user = 1; 638 632 ··· 729 721 case SIGFPE: 730 722 case SIGWINCH: 731 723 ptrace(PTRACE_GETSIGINFO, pid, 0, 732 - (struct siginfo *)&si_ptrace); 733 - si = &si_ptrace; 724 + (struct siginfo *)&si_local); 725 + si = &si_local; 734 726 break; 735 727 default: 736 728 si = NULL; ··· 740 732 sig = 0; 741 733 } 742 734 } 735 + 736 + exit_turnstile(mm_id); 743 737 744 738 UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ 745 739 ··· 812 802 813 803 static jmp_buf initial_jmpbuf; 814 804 815 - /* XXX Make these percpu */ 816 - static void (*cb_proc)(void *arg); 817 - static void *cb_arg; 818 - static jmp_buf *cb_back; 805 + static __thread void (*cb_proc)(void *arg); 806 + static __thread void *cb_arg; 807 + static __thread jmp_buf *cb_back; 819 808 820 809 int start_idle_thread(void *stack, jmp_buf *switch_buf) 821 810 { ··· 868 859 cb_arg = arg; 869 860 cb_back = &here; 870 861 871 - block_signals_trace(); 862 + initial_jmpbuf_lock(); 872 863 if 
(UML_SETJMP(&here) == 0) 873 864 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_CALLBACK); 874 - unblock_signals_trace(); 865 + initial_jmpbuf_unlock(); 875 866 876 867 cb_proc = NULL; 877 868 cb_arg = NULL; ··· 880 871 881 872 void halt_skas(void) 882 873 { 883 - block_signals_trace(); 874 + initial_jmpbuf_lock(); 884 875 UML_LONGJMP(&initial_jmpbuf, INIT_JMP_HALT); 876 + /* unreachable */ 885 877 } 886 878 887 879 static bool noreboot; ··· 902 892 903 893 void reboot_skas(void) 904 894 { 905 - block_signals_trace(); 895 + initial_jmpbuf_lock(); 906 896 UML_LONGJMP(&initial_jmpbuf, noreboot ? INIT_JMP_HALT : INIT_JMP_REBOOT); 897 + /* unreachable */ 907 898 }
+148
arch/um/os-Linux/smp.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2025 Ant Group 4 + * Author: Tiwei Bie <tiwei.btw@antgroup.com> 5 + */ 6 + 7 + #include <errno.h> 8 + #include <pthread.h> 9 + #include <signal.h> 10 + #include <kern_util.h> 11 + #include <um_malloc.h> 12 + #include <init.h> 13 + #include <os.h> 14 + #include <smp.h> 15 + #include "internal.h" 16 + 17 + struct cpu_thread_data { 18 + int cpu; 19 + sigset_t sigset; 20 + }; 21 + 22 + static __thread int __curr_cpu; 23 + 24 + int uml_curr_cpu(void) 25 + { 26 + return __curr_cpu; 27 + } 28 + 29 + static pthread_t cpu_threads[CONFIG_NR_CPUS]; 30 + 31 + static void *cpu_thread(void *arg) 32 + { 33 + struct cpu_thread_data *data = arg; 34 + 35 + __curr_cpu = data->cpu; 36 + 37 + uml_start_secondary(data); 38 + 39 + return NULL; 40 + } 41 + 42 + int os_start_cpu_thread(int cpu) 43 + { 44 + struct cpu_thread_data *data; 45 + sigset_t sigset, oset; 46 + int err; 47 + 48 + data = uml_kmalloc(sizeof(*data), UM_GFP_ATOMIC); 49 + if (!data) 50 + return -ENOMEM; 51 + 52 + sigfillset(&sigset); 53 + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { 54 + err = errno; 55 + goto err; 56 + } 57 + 58 + data->cpu = cpu; 59 + data->sigset = oset; 60 + 61 + err = pthread_create(&cpu_threads[cpu], NULL, cpu_thread, data); 62 + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) 63 + panic("Failed to restore the signal mask, errno = %d", errno); 64 + if (err != 0) 65 + goto err; 66 + 67 + return 0; 68 + 69 + err: 70 + kfree(data); 71 + return -err; 72 + } 73 + 74 + void os_start_secondary(void *arg, jmp_buf *switch_buf) 75 + { 76 + struct cpu_thread_data *data = arg; 77 + 78 + sigaddset(&data->sigset, IPI_SIGNAL); 79 + sigaddset(&data->sigset, SIGIO); 80 + 81 + if (sigprocmask(SIG_SETMASK, &data->sigset, NULL) < 0) 82 + panic("Failed to restore the signal mask, errno = %d", errno); 83 + 84 + kfree(data); 85 + longjmp(*switch_buf, 1); 86 + 87 + /* unreachable */ 88 + printk(UM_KERN_ERR "impossible long jump!"); 89 + 
fatal_sigsegv(); 90 + } 91 + 92 + int os_send_ipi(int cpu, int vector) 93 + { 94 + union sigval value = { .sival_int = vector }; 95 + 96 + return pthread_sigqueue(cpu_threads[cpu], IPI_SIGNAL, value); 97 + } 98 + 99 + static void __local_ipi_set(int enable) 100 + { 101 + sigset_t sigset; 102 + 103 + sigemptyset(&sigset); 104 + sigaddset(&sigset, IPI_SIGNAL); 105 + 106 + if (sigprocmask(enable ? SIG_UNBLOCK : SIG_BLOCK, &sigset, NULL) < 0) 107 + panic("%s: sigprocmask failed, errno = %d", __func__, errno); 108 + } 109 + 110 + void os_local_ipi_enable(void) 111 + { 112 + __local_ipi_set(1); 113 + } 114 + 115 + void os_local_ipi_disable(void) 116 + { 117 + __local_ipi_set(0); 118 + } 119 + 120 + static void ipi_sig_handler(int sig, siginfo_t *si, void *uc) 121 + { 122 + int save_errno = errno; 123 + 124 + signals_enabled = 0; 125 + um_trace_signals_off(); 126 + 127 + uml_ipi_handler(si->si_value.sival_int); 128 + 129 + um_trace_signals_on(); 130 + signals_enabled = 1; 131 + 132 + errno = save_errno; 133 + } 134 + 135 + void __init os_init_smp(void) 136 + { 137 + struct sigaction action = { 138 + .sa_sigaction = ipi_sig_handler, 139 + .sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART, 140 + }; 141 + 142 + sigfillset(&action.sa_mask); 143 + 144 + if (sigaction(IPI_SIGNAL, &action, NULL) < 0) 145 + panic("%s: sigaction failed, errno = %d", __func__, errno); 146 + 147 + cpu_threads[0] = pthread_self(); 148 + }
+4
arch/um/os-Linux/start_up.c
··· 22 22 #include <asm/unistd.h> 23 23 #include <init.h> 24 24 #include <os.h> 25 + #include <smp.h> 25 26 #include <kern_util.h> 26 27 #include <mem_user.h> 27 28 #include <ptrace_user.h> ··· 481 480 if (seccomp_config == 2) 482 481 fatal("SECCOMP userspace requested but not functional!\n"); 483 482 } 483 + 484 + if (uml_ncpus > 1) 485 + fatal("SMP is not supported with PTRACE userspace.\n"); 484 486 485 487 using_seccomp = 0; 486 488 check_ptrace();
+33 -5
arch/um/os-Linux/time.c
··· 11 11 #include <errno.h> 12 12 #include <signal.h> 13 13 #include <time.h> 14 + #include <sys/signalfd.h> 14 15 #include <sys/time.h> 15 16 #include <kern_util.h> 16 17 #include <os.h> 18 + #include <smp.h> 17 19 #include <string.h> 18 20 #include "internal.h" 19 21 ··· 43 41 */ 44 42 int os_timer_create(void) 45 43 { 46 - timer_t *t = &event_high_res_timer[0]; 44 + int cpu = uml_curr_cpu(); 45 + timer_t *t = &event_high_res_timer[cpu]; 47 46 struct sigevent sev = { 48 47 .sigev_notify = SIGEV_THREAD_ID, 49 48 .sigev_signo = SIGALRM, ··· 108 105 return timespec_to_ns(&ts); 109 106 } 110 107 108 + static __thread int wake_signals; 109 + 110 + void os_idle_prepare(void) 111 + { 112 + sigset_t set; 113 + 114 + sigemptyset(&set); 115 + sigaddset(&set, SIGALRM); 116 + sigaddset(&set, IPI_SIGNAL); 117 + 118 + /* 119 + * We need to use signalfd rather than sigsuspend in idle sleep 120 + * because the IPI signal is a real-time signal that carries data, 121 + * and unlike handling SIGALRM, we cannot simply flag it in 122 + * signals_pending. 123 + */ 124 + wake_signals = signalfd(-1, &set, SFD_CLOEXEC); 125 + if (wake_signals < 0) 126 + panic("Failed to create signal FD, errno = %d", errno); 127 + } 128 + 111 129 /** 112 130 * os_idle_sleep() - sleep until interrupted 113 131 */ 114 132 void os_idle_sleep(void) 115 133 { 116 - sigset_t set, old; 134 + sigset_t set; 117 135 118 - /* Block SIGALRM while performing the need_resched check. */ 136 + /* 137 + * Block SIGALRM while performing the need_resched check. 138 + * Note that, because IRQs are disabled, the IPI signal is 139 + * already blocked. 140 + */ 119 141 sigemptyset(&set); 120 142 sigaddset(&set, SIGALRM); 121 - sigprocmask(SIG_BLOCK, &set, &old); 143 + sigprocmask(SIG_BLOCK, &set, NULL); 122 144 123 145 /* 124 146 * Because disabling IRQs does not block SIGALRM, it is also 125 147 * necessary to check for any pending timer alarms. 
126 148 */ 127 149 if (!uml_need_resched() && !timer_alarm_pending()) 128 - sigsuspend(&old); 150 + os_poll(1, &wake_signals); 129 151 130 152 /* Restore the signal mask. */ 131 153 sigprocmask(SIG_UNBLOCK, &set, NULL);