Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: LGPL-2.1
2/*
3 * rseq.c
4 *
5 * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; only
10 * version 2.1 of the License.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 */
17
18#define _GNU_SOURCE
19#include <errno.h>
20#include <sched.h>
21#include <stdio.h>
22#include <stdlib.h>
23#include <string.h>
24#include <unistd.h>
25#include <syscall.h>
26#include <assert.h>
27#include <signal.h>
28#include <limits.h>
29#include <dlfcn.h>
30#include <stddef.h>
31#include <sys/auxv.h>
32#include <linux/auxvec.h>
33
34#include <linux/compiler.h>
35
36#include "kselftest.h"
37#include "rseq.h"
38
39/*
40 * Define weak versions to play nice with binaries that are statically linked
41 * against a libc that doesn't support registering its own rseq.
42 */
43extern __weak ptrdiff_t __rseq_offset;
44extern __weak unsigned int __rseq_size;
45extern __weak unsigned int __rseq_flags;
46
47static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
48static const unsigned int *libc_rseq_size_p = &__rseq_size;
49static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
50
/*
 * Non-zero when this library, rather than libc, is responsible for the
 * rseq registration. Set by rseq_init(), cleared by rseq_exit().
 */
static int rseq_ownership;

/* Flags used during rseq registration. */
unsigned int rseq_flags;

/*
 * Size of the registered rseq area. Starts out as -1U until rseq_init()
 * runs; 0 means no registration has succeeded yet.
 */
unsigned int rseq_size = -1U;

/* Offset of the rseq area relative to the thread pointer. */
ptrdiff_t rseq_offset;
64
/* The original struct rseq exposes 20 bytes of features... */
#define ORIG_RSEQ_FEATURE_SIZE		20

/* ...inside an allocation of 32 bytes. */
#define ORIG_RSEQ_ALLOC_SIZE		32

/* Bytes reserved in TLS for the rseq area; deliberately large. */
#define RSEQ_THREAD_AREA_ALLOC_SIZE	1024
73
74/*
75 * Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an
76 * rseq registration that is larger than the current rseq ABI.
77 */
78union rseq_tls {
79 struct rseq_abi abi;
80 char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE];
81};
82
83static
84__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = {
85 .abi = {
86 .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
87 },
88};
89
/*
 * Invoke the rseq system call directly through syscall(2).
 *
 * Returns the syscall() result narrowed to int.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret;

	ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
	return ret;
}
95
/*
 * Invoke the getcpu system call directly through syscall(2), passing NULL
 * as its third argument.
 *
 * Returns the syscall() result narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret;

	ret = syscall(__NR_getcpu, cpu, node, NULL);
	return ret;
}
100
/*
 * Probe for rseq support by attempting a registration with a NULL area
 * and a zero length, which is expected to fail.
 *
 * Returns false when the call fails with ENOSYS and true when it fails
 * with EINVAL. Any other outcome, including success, aborts.
 */
bool rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return false;
	if (errno == EINVAL)
		return true;

	/* Unexpected errno. */
	abort();
}
117
118/* The rseq areas need to be at least 32 bytes. */
119static
120unsigned int get_rseq_min_alloc_size(void)
121{
122 unsigned int alloc_size = rseq_size;
123
124 if (alloc_size < ORIG_RSEQ_ALLOC_SIZE)
125 alloc_size = ORIG_RSEQ_ALLOC_SIZE;
126 return alloc_size;
127}
128
129/*
130 * Return the feature size supported by the kernel.
131 *
132 * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE):
133 *
134 * 0: Return ORIG_RSEQ_FEATURE_SIZE (20)
135 * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE).
136 *
137 * It should never return a value below ORIG_RSEQ_FEATURE_SIZE.
138 */
139static
140unsigned int get_rseq_kernel_feature_size(void)
141{
142 unsigned long auxv_rseq_feature_size, auxv_rseq_align;
143
144 auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
145 assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
146
147 auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
148 assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
149 if (auxv_rseq_feature_size)
150 return auxv_rseq_feature_size;
151 else
152 return ORIG_RSEQ_FEATURE_SIZE;
153}
154
155int rseq_register_current_thread(void)
156{
157 int rc;
158
159 if (!rseq_ownership) {
160 /* Treat libc's ownership as a successful registration. */
161 return 0;
162 }
163 rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
164 if (rc) {
165 /*
166 * After at least one thread has registered successfully
167 * (rseq_size > 0), the registration of other threads should
168 * never fail.
169 */
170 if (RSEQ_READ_ONCE(rseq_size) > 0) {
171 /* Incoherent success/failure within process. */
172 abort();
173 }
174 return -1;
175 }
176 assert(rseq_current_cpu_raw() >= 0);
177
178 /*
179 * The first thread to register sets the rseq_size to mimic the libc
180 * behavior.
181 */
182 if (RSEQ_READ_ONCE(rseq_size) == 0) {
183 RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size());
184 }
185
186 return 0;
187}
188
189int rseq_unregister_current_thread(void)
190{
191 int rc;
192
193 if (!rseq_ownership) {
194 /* Treat libc's ownership as a successful unregistration. */
195 return 0;
196 }
197 rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
198 if (rc)
199 return -1;
200 return 0;
201}
202
203static __attribute__((constructor))
204void rseq_init(void)
205{
206 /*
207 * If the libc's registered rseq size isn't already valid, it may be
208 * because the binary is dynamically linked and not necessarily due to
209 * libc not having registered a restartable sequence. Try to find the
210 * symbols if that's the case.
211 */
212 if (!libc_rseq_size_p || !*libc_rseq_size_p) {
213 libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
214 libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
215 libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
216 }
217 if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
218 *libc_rseq_size_p != 0) {
219 unsigned int libc_rseq_size;
220
221 /* rseq registration owned by glibc */
222 rseq_offset = *libc_rseq_offset_p;
223 libc_rseq_size = *libc_rseq_size_p;
224 rseq_flags = *libc_rseq_flags_p;
225
226 /*
227 * Previous versions of glibc expose the value
228 * 32 even though the kernel only supported 20
229 * bytes initially. Therefore treat 32 as a
230 * special-case. glibc 2.40 exposes a 20 bytes
231 * __rseq_size without using getauxval(3) to
232 * query the supported size, while still allocating a 32
233 * bytes area. Also treat 20 as a special-case.
234 *
235 * Special-cases are handled by using the following
236 * value as active feature set size:
237 *
238 * rseq_size = min(32, get_rseq_kernel_feature_size())
239 */
240 switch (libc_rseq_size) {
241 case ORIG_RSEQ_FEATURE_SIZE:
242 fallthrough;
243 case ORIG_RSEQ_ALLOC_SIZE:
244 {
245 unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();
246
247 if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
248 rseq_size = rseq_kernel_feature_size;
249 else
250 rseq_size = ORIG_RSEQ_ALLOC_SIZE;
251 break;
252 }
253 default:
254 /* Otherwise just use the __rseq_size from libc as rseq_size. */
255 rseq_size = libc_rseq_size;
256 break;
257 }
258 return;
259 }
260 rseq_ownership = 1;
261
262 /* Calculate the offset of the rseq area from the thread pointer. */
263 rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer();
264
265 /* rseq flags are deprecated, always set to 0. */
266 rseq_flags = 0;
267
268 /*
269 * Set the size to 0 until at least one thread registers to mimic the
270 * libc behavior.
271 */
272 rseq_size = 0;
273}
274
275static __attribute__((destructor))
276void rseq_exit(void)
277{
278 if (!rseq_ownership)
279 return;
280 rseq_offset = 0;
281 rseq_size = -1U;
282 rseq_ownership = 0;
283}
284
/*
 * Fallback way of obtaining the current CPU number, via sched_getcpu().
 *
 * Returns the CPU number. If sched_getcpu() reports a negative value, the
 * error is printed with perror() and the process aborts.
 */
int32_t rseq_fallback_current_cpu(void)
{
	int32_t cpu = sched_getcpu();

	if (cpu >= 0)
		return cpu;

	perror("sched_getcpu()");
	abort();
}
296
/*
 * Fallback way of obtaining the current node id, via the getcpu system
 * call.
 *
 * Returns the node id on success. On failure the error is printed with
 * perror() and the non-zero sys_getcpu() result is returned instead.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t cpu_id, node_id;
	int ret = sys_getcpu(&cpu_id, &node_id);

	if (ret == 0)
		return (int32_t) node_id;

	perror("sys_getcpu()");
	return ret;
}