Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rseq: slice ext: Ensure rseq feature size differs from original rseq size

Before rseq became extensible, its original size was 32 bytes even
though the active rseq area was only 20 bytes. This had the following
impact in terms of userspace ecosystem evolution:

* The GNU libc between 2.35 and 2.39 exposes a __rseq_size symbol set
to 32, even though the size of the active rseq area is really 20.
* The GNU libc 2.40 changes this __rseq_size to 20, thus making it
express the active rseq area.
* Starting from glibc 2.41, __rseq_size corresponds to the
AT_RSEQ_FEATURE_SIZE from getauxval(3).

This means that users of __rseq_size can always expect it to
correspond to the active rseq area, except for the value 32, for
which the active rseq area is 20 bytes.

Exposing a 32-byte feature size would make life needlessly painful
for userspace. Therefore, add a reserved field at the end of the
rseq area to bump the feature size to 33 bytes. This reserved field
is expected to be replaced by whatever field comes next, which is
expected to be larger than 1 byte.

The effect of this change is to increase the size from 32 to 64 bytes
before we actually have fields using that memory.

Clarify the allocation size and alignment requirements in the struct
rseq uapi comment.

Change the value returned by getauxval(AT_RSEQ_ALIGN) to the active
rseq area size rounded up to the next power of 2, which guarantees
that the rseq structure will always be aligned on the nearest power
of two large enough to contain it, even as it grows. Change the
alignment check in the rseq registration accordingly.

This will minimize the amount of ABI corner-cases we need to document
and require userspace to play games with. The rule stays simple when
__rseq_size != 32:

#define rseq_field_available(field) (__rseq_size >= offsetofend(struct rseq_abi, field))

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com

authored by

Mathieu Desnoyers and committed by
Peter Zijlstra
3b68df97 26d43a90

+38 -6
+2 -1
fs/binfmt_elf.c
··· 47 47 #include <linux/dax.h> 48 48 #include <linux/uaccess.h> 49 49 #include <uapi/linux/rseq.h> 50 + #include <linux/rseq.h> 50 51 #include <asm/param.h> 51 52 #include <asm/page.h> 52 53 ··· 287 286 } 288 287 #ifdef CONFIG_RSEQ 289 288 NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); 290 - NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); 289 + NEW_AUX_ENT(AT_RSEQ_ALIGN, rseq_alloc_align()); 291 290 #endif 292 291 #undef NEW_AUX_ENT 293 292 /* AT_NULL is zero; clear the rest too */
+12
include/linux/rseq.h
··· 146 146 t->rseq = current->rseq; 147 147 } 148 148 149 + /* 150 + * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq 151 + * registration. This is the active rseq area size rounded up to next 152 + * power of 2, which guarantees that the rseq structure will always be 153 + * aligned on the nearest power of two large enough to contain it, even 154 + * as it grows. 155 + */ 156 + static inline unsigned int rseq_alloc_align(void) 157 + { 158 + return 1U << get_count_order(offsetof(struct rseq, end)); 159 + } 160 + 149 161 #else /* CONFIG_RSEQ */ 150 162 static inline void rseq_handle_slowpath(struct pt_regs *regs) { } 151 163 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+22 -4
include/uapi/linux/rseq.h
··· 87 87 }; 88 88 89 89 /* 90 - * struct rseq is aligned on 4 * 8 bytes to ensure it is always 91 - * contained within a single cache-line. 90 + * The original size and alignment of the allocation for struct rseq is 91 + * 32 bytes. 92 92 * 93 - * A single struct rseq per thread is allowed. 93 + * The allocation size needs to be greater or equal to 94 + * max(getauxval(AT_RSEQ_FEATURE_SIZE), 32), and the allocation needs to 95 + * be aligned on max(getauxval(AT_RSEQ_ALIGN), 32). 96 + * 97 + * As an alternative, userspace is allowed to use both the original size 98 + * and alignment of 32 bytes for backward compatibility. 99 + * 100 + * A single active struct rseq registration per thread is allowed. 94 101 */ 95 102 struct rseq { 96 103 /* ··· 188 181 struct rseq_slice_ctrl slice_ctrl; 189 182 190 183 /* 184 + * Before rseq became extensible, its original size was 32 bytes even 185 + * though the active rseq area was only 20 bytes. 186 + * Exposing a 32 bytes feature size would make life needlessly painful 187 + * for userspace. Therefore, add a reserved byte after byte 32 188 + * to bump the rseq feature size from 32 to 33. 189 + * The next field to be added to the rseq area will be larger 190 + * than one byte, and will replace this reserved byte. 191 + */ 192 + __u8 __reserved; 193 + 194 + /* 191 195 * Flexible array member at end of structure, after last feature field. 192 196 */ 193 197 char end[]; 194 - } __attribute__((aligned(4 * sizeof(__u64)))); 198 + } __attribute__((aligned(32))); 195 199 196 200 #endif /* _UAPI_LINUX_RSEQ_H */
+2 -1
kernel/rseq.c
··· 80 80 #include <linux/syscalls.h> 81 81 #include <linux/uaccess.h> 82 82 #include <linux/types.h> 83 + #include <linux/rseq.h> 83 84 #include <asm/ptrace.h> 84 85 85 86 #define CREATE_TRACE_POINTS ··· 457 456 */ 458 457 if (rseq_len < ORIG_RSEQ_SIZE || 459 458 (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || 460 - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || 459 + (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) || 461 460 rseq_len < offsetof(struct rseq, end)))) 462 461 return -EINVAL; 463 462 if (!access_ok(rseq, rseq_len))