/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * The slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled, so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 *  - If the critical section is invalid, terminate the task.
 *
 *  - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 *  - If valid and the instruction pointer is outside, clear the critical
 *    section address.
 *
 * Returns true if the section was valid and either the fixup or the clear
 * was done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It stays clear when the failure was an unresolved
 * page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence, unresolved page
 * faults in task context are fatal too.
 */

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See the abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in an interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * On debug kernels, validate that user space did not mess with the IDs if
 * the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region the CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here; that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited abilities to write creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address when @csaddr != NULL.
 * This allows putting the ID update and the read under the same uaccess
 * region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out. This spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It stays clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence, unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
	}

	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}

/*
 * Update user space with the new IDs and conditionally check whether the
 * task is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry was
	 * not from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one-time comparison in the fast path when there is no event
 *    to handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault,
 *    so the straight inline operation is:
 *
 *    - Four 32-bit stores, only if CPU ID / MM CID need to be updated
 *    - One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *    - One 64-bit load to retrieve the start IP
 *    - One 64-bit load to retrieve the offset for calculating the end
 *    - One 64-bit load to retrieve the abort IP
 *    - One 32-bit load to retrieve the signature
 *    - One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled.
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If the IDs have not changed, rseq_event::user_irq must be
		 * true. See rseq_sched_switch_event().
		 */
		u64 csaddr;

		if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
			return false;

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id	= task_cpu(t),
		.mm_cid	= task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor got the flag
	 * enforced by the rseq syscall or execve, there is nothing to do
	 * here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task, so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing-to-do
	 * case, including clearing the events, but your mileage might vary.
	 */
	if (unlikely((t->rseq.event.sched_switch))) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (likely(!test_tif_rseq(ti_work)))
		return false;

	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
		current->rseq.event.slowpath = true;
		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
		return true;
	}

	clear_tif_rseq();
	return false;
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */
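
/*
 * Illustrative user space sketch (not part of this header, kept out of the
 * build with #if 0): a minimal example of where the signature compared
 * against t->rseq.sig above comes from. The rseq area is registered with
 * the rseq() syscall together with a signature, and the kernel later
 * requires that very value in the four bytes preceding any abort IP.
 * EXAMPLE_RSEQ_SIG and the variable names below are made up for
 * illustration. Note that glibc >= 2.35 registers rseq itself, in which
 * case a second registration fails with EBUSY.
 */
#if 0
#define _GNU_SOURCE
#include <linux/rseq.h>		/* UAPI struct rseq */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define EXAMPLE_RSEQ_SIG	0x53053053	/* arbitrary example signature */

/* The TLS area which the kernel updates on exit to user mode */
static __thread struct rseq rseq_area __attribute__((aligned(32)));

int main(void)
{
	/* rseq(rseq, rseq_len, flags, sig) - no glibc wrapper, use syscall() */
	if (syscall(__NR_rseq, &rseq_area, sizeof(rseq_area), 0, EXAMPLE_RSEQ_SIG)) {
		perror("rseq registration");
		return 1;
	}
	/* Populated by the kernel on the next exit to user mode */
	printf("cpu_id_start: %u\n", rseq_area.cpu_id_start);
	return 0;
}
#endif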