Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/cache.h>
#include <linux/kernel.h>
#include <linux/time64.h>
#include <vdso/datapage.h>
#include <vdso/getrandom.h>
#include <asm/vdso/getrandom.h>
#include <asm/vdso/vsyscall.h>
#include <asm/unaligned.h>
#include <uapi/linux/mman.h>

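/*
 * Copy @len bytes from @src to @dst in @type-sized chunks, zeroing each chunk
 * of @src as soon as it has been copied out.
 */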
#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do {				\
	while (len >= sizeof(type)) {						\
		__put_unaligned_t(type, __get_unaligned_t(type, src), dst);	\
		__put_unaligned_t(type, 0, src);				\
		dst += sizeof(type);						\
		src += sizeof(type);						\
		len -= sizeof(type);						\
	}									\
} while (0)

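/*
 * Copy @len bytes from @src to @dst while zeroing @src behind the copy, using
 * the widest access size that the architecture handles efficiently.
 */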
static void memcpy_and_zero_src(void *dst, void *src, size_t len)
{
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		if (IS_ENABLED(CONFIG_64BIT))
			MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
		MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
		MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
	}
	MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
}

/**
 * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
 * @rng_info:		Describes state of kernel RNG, memory shared with kernel.
 * @buffer:		Destination buffer to fill with random bytes.
 * @len:		Size of @buffer in bytes.
 * @flags:		Zero or more GRND_* flags.
 * @opaque_state:	Pointer to an opaque state area.
 * @opaque_len:		Length of opaque state area.
 *
 * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
 * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
 * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
 * calls into the syscall.
 *
 * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated
 * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0,
 * this function should not be used.
 *
 * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields
 * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external
 * locking is used, one state must be allocated per thread, as it is not safe to call this function
 * concurrently with the same @opaque_state. However, it is safe to call this using the same
 * @opaque_state that is shared between main code and signal handling code, within the same thread.
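 *
 * For example, a caller might set up a state as follows (an illustrative sketch;
 * vgetrandom() stands for however the caller resolves the vDSO symbol, and buf,
 * buf_len are caller-supplied):
 *
 *	struct vgetrandom_opaque_params params;
 *	void *state;
 *	ssize_t ret;
 *
 *	if (vgetrandom(NULL, 0, 0, &params, ~0UL) != 0)
 *		return -1;	// vDSO RNG unusable; use the syscall directly.
 *	state = mmap(NULL, params.size_of_opaque_state, params.mmap_prot,
 *		     params.mmap_flags, -1, 0);
 *	if (state == MAP_FAILED)
 *		return -1;
 *	ret = vgetrandom(buf, buf_len, 0, state, params.size_of_opaque_state);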
 *
 * Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
 */
static __always_inline ssize_t
__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
		       unsigned int flags, void *opaque_state, size_t opaque_len)
{
	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
	struct vgetrandom_state *state = opaque_state;
	size_t batch_len, nblocks, orig_len = len;
	bool in_use, have_retried = false;
	unsigned long current_generation;
	void *orig_buffer = buffer;
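	/*
	 * ChaCha20 block counter, advanced in place by __arch_chacha20_blocks_nostack(),
	 * so that no block is ever generated twice for a given key.
	 */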
	u32 counter[2] = { 0 };

	if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) {
		*(struct vgetrandom_opaque_params *)opaque_state = (struct vgetrandom_opaque_params) {
			.size_of_opaque_state = sizeof(*state),
			.mmap_prot = PROT_READ | PROT_WRITE,
			.mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS
		};
		return 0;
	}

	/* The state must not straddle a page, since pages can be zeroed at any time. */
	if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE))
		return -EFAULT;

	/* If the caller passes the wrong size, which might happen due to CRIU, fall back. */
	if (unlikely(opaque_len != sizeof(*state)))
		goto fallback_syscall;

	/*
	 * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
	 * userspace, because A) the various @flags require this to block, or not, depending on
	 * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
	 * ready is to reseed from the entropy pool at every invocation.
	 */
	if (unlikely(!READ_ONCE(rng_info->is_ready)))
		goto fallback_syscall;

	/*
	 * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
	 * initialized, the @flags parameter may require this to block or return an error, even when
	 * @len is zero.
	 */
	if (unlikely(!len))
		return 0;

	/*
	 * @state->in_use is basic reentrancy protection against this running in a signal handler
	 * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
	 * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
	 * writing @state->in_use, there is still no race, because the signal handler will run to
	 * its completion before returning execution.
	 */
	in_use = READ_ONCE(state->in_use);
	if (unlikely(in_use))
		/* The syscall simply fills the buffer and does not touch @state, so fall back. */
		goto fallback_syscall;
	WRITE_ONCE(state->in_use, true);

retry_generation:
	/*
	 * @rng_info->generation must always be read here, as it serializes @state->key with the
	 * kernel's RNG reseeding schedule.
	 */
	current_generation = READ_ONCE(rng_info->generation);

	/*
	 * If @state->generation doesn't match the kernel RNG's generation, then it means the
	 * kernel's RNG has reseeded, and so @state->key is reseeded as well.
	 */
	if (unlikely(state->generation != current_generation)) {
		/*
		 * Write the generation before filling the key, in case of fork. If there is a fork
		 * just after this line, the parent and child will get different random bytes from
		 * the syscall, which is good. However, were this line to occur after the getrandom
		 * syscall, then both child and parent could have the same bytes and the same
		 * generation counter, so the fork would not be detected. Therefore, write
		 * @state->generation before the call to the getrandom syscall.
		 */
		WRITE_ONCE(state->generation, current_generation);

		/*
		 * Prevent the syscall from being reordered wrt current_generation. Pairs with the
		 * smp_store_release(&_vdso_rng_data.generation) in random.c.
		 */
		smp_rmb();

		/* Reseed @state->key using fresh bytes from the kernel. */
		if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
			/*
			 * If the syscall failed to refresh the key, then @state->key is now
			 * invalid, so invalidate the generation so that it is not used again, and
			 * fall back to using the syscall entirely.
			 */
			WRITE_ONCE(state->generation, 0);

			/*
			 * Set @state->in_use to false only after the last write to @state in the
			 * line above.
			 */
			WRITE_ONCE(state->in_use, false);

			goto fallback_syscall;
		}

		/*
		 * Set @state->pos to beyond the end of the batch, so that the batch is refilled
		 * using the new key.
		 */
		state->pos = sizeof(state->batch);
	}

	/* Set @len to the total number of bytes this function is allowed to read, i.e. @ret. */
	len = ret;
more_batch:
	/*
	 * First use bytes out of @state->batch, which may have been filled by the last call to this
	 * function.
	 */
	batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
	if (batch_len) {
		/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
		memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
		state->pos += batch_len;
		buffer += batch_len;
		len -= batch_len;
	}

	if (!len) {
		/* Prevent the loop from being reordered wrt ->generation. */
		barrier();

		/*
		 * Since @rng_info->generation will never be 0, re-read @state->generation, rather
		 * than using the local current_generation variable, to learn whether a fork
		 * occurred or if @state was zeroed due to memory pressure. Primarily, though, this
		 * indicates whether the kernel's RNG has reseeded, in which case generate a new key
		 * and start over.
		 */
		if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
			/*
			 * Prevent this from looping forever in case of low memory or racing with a
			 * user force-reseeding the kernel's RNG using the ioctl.
			 */
			if (have_retried) {
				WRITE_ONCE(state->in_use, false);
				goto fallback_syscall;
			}

			have_retried = true;
			buffer = orig_buffer;
			goto retry_generation;
		}

		/*
		 * Set @state->in_use to false only when there will be no more reads or writes of
		 * @state.
		 */
		WRITE_ONCE(state->in_use, false);
		return ret;
	}

	/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
	nblocks = len / CHACHA_BLOCK_SIZE;
	if (nblocks) {
		__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
		buffer += nblocks * CHACHA_BLOCK_SIZE;
		len -= nblocks * CHACHA_BLOCK_SIZE;
	}

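	/* The batch-refill call below assumes @state->batch_key is whole ChaCha blocks. */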
	BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);

	/* Refill the batch and overwrite the key, in order to preserve forward secrecy. */
	__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
				       sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);

	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
	state->pos = 0;
	goto more_batch;

fallback_syscall:
	return getrandom_syscall(orig_buffer, orig_len, flags);
}

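/*
 * Thin wrapper used by each architecture's vDSO entry point (e.g. __vdso_getrandom()),
 * supplying the shared RNG data page via __arch_get_vdso_rng_data().
 */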
static __always_inline ssize_t
__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
{
	return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len);
}