Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

arm64: vDSO: Wire up getrandom() vDSO implementation

Hook up the generic vDSO implementation to the aarch64 vDSO data page.
The data required by _vdso_rng_data is placed within the _vdso_data vvar
page, by using an offset larger than the vdso_data.

The vDSO function requires a ChaCha20 implementation that does not write
to the stack, and that can do an entire ChaCha20 permutation. The one
provided uses NEON on the permute operation, with a fallback to the
syscall for chips that do not support AdvSIMD.

This also passes the vdso_test_chacha test along with
vdso_test_getrandom. The vdso_test_getrandom bench-single result on
Neoverse-N1 shows:

vdso: 25000000 times in 0.783884250 seconds
libc: 25000000 times in 8.780275399 seconds
syscall: 25000000 times in 8.786581518 seconds

A small fixup to arch/arm64/include/asm/mman.h was required to avoid
pulling kernel code into the vDSO, similar to what's already done in
arch/arm64/include/asm/rwonce.h.

Signed-off-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

authored by

Adhemerval Zanella and committed by
Jason A. Donenfeld
712676ea 2c2ca341

+285 -16
+1
arch/arm64/Kconfig
··· 262 262 select TRACE_IRQFLAGS_NMI_SUPPORT 263 263 select HAVE_SOFTIRQ_ON_OWN_STACK 264 264 select USER_STACKTRACE_SUPPORT 265 + select VDSO_GETRANDOM 265 266 help 266 267 ARM 64-bit (AArch64) Linux support. 267 268
+5 -1
arch/arm64/include/asm/mman.h
··· 2 2 #ifndef __ASM_MMAN_H__ 3 3 #define __ASM_MMAN_H__ 4 4 5 + #include <uapi/asm/mman.h> 6 + 7 + #ifndef BUILD_VDSO 5 8 #include <linux/compiler.h> 6 9 #include <linux/types.h> 7 - #include <uapi/asm/mman.h> 8 10 9 11 static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, 10 12 unsigned long pkey __always_unused) ··· 61 59 return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED); 62 60 } 63 61 #define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags) 62 + 63 + #endif /* !BUILD_VDSO */ 64 64 65 65 #endif /* ! __ASM_MMAN_H__ */
+50
arch/arm64/include/asm/vdso/getrandom.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #ifndef __ASM_VDSO_GETRANDOM_H 4 + #define __ASM_VDSO_GETRANDOM_H 5 + 6 + #ifndef __ASSEMBLY__ 7 + 8 + #include <asm/unistd.h> 9 + #include <asm/vdso/vsyscall.h> 10 + #include <vdso/datapage.h> 11 + 12 + /** 13 + * getrandom_syscall - Invoke the getrandom() syscall. 14 + * @buffer: Destination buffer to fill with random bytes. 15 + * @len: Size of @buffer in bytes. 16 + * @flags: Zero or more GRND_* flags. 17 + * Returns: The number of random bytes written to @buffer, or a negative value indicating an error. 18 + */ 19 + static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags) 20 + { 21 + register void *buffer asm ("x0") = _buffer; 22 + register size_t len asm ("x1") = _len; 23 + register unsigned int flags asm ("x2") = _flags; 24 + register long ret asm ("x0"); 25 + register long nr asm ("x8") = __NR_getrandom; 26 + 27 + asm volatile( 28 + " svc #0\n" 29 + : "=r" (ret) 30 + : "r" (buffer), "r" (len), "r" (flags), "r" (nr) 31 + : "memory"); 32 + 33 + return ret; 34 + } 35 + 36 + static __always_inline const struct vdso_rng_data *__arch_get_vdso_rng_data(void) 37 + { 38 + /* 39 + * The RNG data is in the real VVAR data page, but if a task belongs to a time namespace 40 + * then VVAR_DATA_PAGE_OFFSET points to the namespace-specific VVAR page and VVAR_TIMENS_ 41 + * PAGE_OFFSET points to the real VVAR page. 42 + */ 43 + if (IS_ENABLED(CONFIG_TIME_NS) && _vdso_data->clock_mode == VDSO_CLOCKMODE_TIMENS) 44 + return (void *)&_vdso_rng_data + VVAR_TIMENS_PAGE_OFFSET * (1UL << CONFIG_PAGE_SHIFT); 45 + return &_vdso_rng_data; 46 + } 47 + 48 + #endif /* !__ASSEMBLY__ */ 49 + 50 + #endif /* __ASM_VDSO_GETRANDOM_H */
+15
arch/arm64/include/asm/vdso/vsyscall.h
··· 2 2 #ifndef __ASM_VDSO_VSYSCALL_H 3 3 #define __ASM_VDSO_VSYSCALL_H 4 4 5 + #define __VDSO_RND_DATA_OFFSET 480 6 + 5 7 #ifndef __ASSEMBLY__ 6 8 7 9 #include <linux/timekeeper_internal.h> 8 10 #include <vdso/datapage.h> 11 + 12 + enum vvar_pages { 13 + VVAR_DATA_PAGE_OFFSET, 14 + VVAR_TIMENS_PAGE_OFFSET, 15 + VVAR_NR_PAGES, 16 + }; 9 17 10 18 #define VDSO_PRECISION_MASK ~(0xFF00ULL<<48) 11 19 ··· 28 20 return vdso_data; 29 21 } 30 22 #define __arch_get_k_vdso_data __arm64_get_k_vdso_data 23 + 24 + static __always_inline 25 + struct vdso_rng_data *__arm64_get_k_vdso_rnd_data(void) 26 + { 27 + return (void *)vdso_data + __VDSO_RND_DATA_OFFSET; 28 + } 29 + #define __arch_get_k_vdso_rng_data __arm64_get_k_vdso_rnd_data 31 30 32 31 static __always_inline 33 32 void __arm64_update_vsyscall(struct vdso_data *vdata, struct timekeeper *tk)
-6
arch/arm64/kernel/vdso.c
··· 34 34 VDSO_ABI_AA32, 35 35 }; 36 36 37 - enum vvar_pages { 38 - VVAR_DATA_PAGE_OFFSET, 39 - VVAR_TIMENS_PAGE_OFFSET, 40 - VVAR_NR_PAGES, 41 - }; 42 - 43 37 struct vdso_abi_info { 44 38 const char *name; 45 39 const char *vdso_code_start;
+17 -8
arch/arm64/kernel/vdso/Makefile
··· 9 9 # Include the generic Makefile to check the built vdso. 10 10 include $(srctree)/lib/vdso/Makefile 11 11 12 - obj-vdso := vgettimeofday.o note.o sigreturn.o 12 + obj-vdso := vgettimeofday.o note.o sigreturn.o vgetrandom.o vgetrandom-chacha.o 13 13 14 14 # Build rules 15 15 targets := $(obj-vdso) vdso.so vdso.so.dbg ··· 34 34 ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO 35 35 36 36 # -Wmissing-prototypes and -Wmissing-declarations are removed from 37 - # the CFLAGS of vgettimeofday.c to make possible to build the 38 - # kernel with CONFIG_WERROR enabled. 39 - CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ 40 - $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ 41 - $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ 42 - -Wmissing-prototypes -Wmissing-declarations 37 + # the CFLAGS to make possible to build the kernel with CONFIG_WERROR enabled. 38 + CC_FLAGS_REMOVE_VDSO := $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \ 39 + $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \ 40 + $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \ 41 + -Wmissing-prototypes -Wmissing-declarations 43 42 44 - CFLAGS_vgettimeofday.o = -O2 -mcmodel=tiny -fasynchronous-unwind-tables 43 + CC_FLAGS_ADD_VDSO := -O2 -mcmodel=tiny -fasynchronous-unwind-tables 44 + 45 + CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_REMOVE_VDSO) 46 + CFLAGS_REMOVE_vgetrandom.o = $(CC_FLAGS_REMOVE_VDSO) 47 + 48 + CFLAGS_vgettimeofday.o = $(CC_FLAGS_ADD_VDSO) 49 + CFLAGS_vgetrandom.o = $(CC_FLAGS_ADD_VDSO) 45 50 46 51 ifneq ($(c-gettimeofday-y),) 47 52 CFLAGS_vgettimeofday.o += -include $(c-gettimeofday-y) 53 + endif 54 + 55 + ifneq ($(c-getrandom-y),) 56 + CFLAGS_vgetrandom.o += -include $(c-getrandom-y) 48 57 endif 49 58 50 59 targets += vdso.lds
+4
arch/arm64/kernel/vdso/vdso.lds.S
··· 11 11 #include <linux/const.h> 12 12 #include <asm/page.h> 13 13 #include <asm/vdso.h> 14 + #include <asm/vdso/vsyscall.h> 14 15 #include <asm-generic/vmlinux.lds.h> 16 + #include <vdso/datapage.h> 15 17 16 18 OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64") 17 19 OUTPUT_ARCH(aarch64) ··· 21 19 SECTIONS 22 20 { 23 21 PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); 22 + PROVIDE(_vdso_rng_data = _vdso_data + __VDSO_RND_DATA_OFFSET); 24 23 #ifdef CONFIG_TIME_NS 25 24 PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); 26 25 #endif ··· 105 102 __kernel_gettimeofday; 106 103 __kernel_clock_gettime; 107 104 __kernel_clock_getres; 105 + __kernel_getrandom; 108 106 local: *; 109 107 }; 110 108 }
+172
arch/arm64/kernel/vdso/vgetrandom-chacha.S
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/linkage.h> 4 + #include <asm/cache.h> 5 + #include <asm/assembler.h> 6 + 7 + .text 8 + 9 + #define state0 v0 10 + #define state1 v1 11 + #define state2 v2 12 + #define state3 v3 13 + #define copy0 v4 14 + #define copy0_q q4 15 + #define copy1 v5 16 + #define copy2 v6 17 + #define copy3 v7 18 + #define copy3_d d7 19 + #define one_d d16 20 + #define one_q q16 21 + #define one_v v16 22 + #define tmp v17 23 + #define rot8 v18 24 + 25 + /* 26 + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive 27 + * number of blocks of output with nonce 0, taking an input key and 8-bytes 28 + * counter. Importantly does not spill to the stack. 29 + * 30 + * This implementation avoids d8-d15 because they are callee-save in user 31 + * space. 32 + * 33 + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes, 34 + * const uint8_t *key, 35 + * uint32_t *counter, 36 + * size_t nblocks) 37 + * 38 + * x0: output bytes 39 + * x1: 32-byte key input 40 + * x2: 8-byte counter input/output 41 + * x3: number of 64-byte block to write to output 42 + */ 43 + SYM_FUNC_START(__arch_chacha20_blocks_nostack) 44 + 45 + /* copy0 = "expand 32-byte k" */ 46 + mov_q x8, 0x3320646e61707865 47 + mov_q x9, 0x6b20657479622d32 48 + mov copy0.d[0], x8 49 + mov copy0.d[1], x9 50 + 51 + /* copy1,copy2 = key */ 52 + ld1 { copy1.4s, copy2.4s }, [x1] 53 + /* copy3 = counter || zero nonce */ 54 + ld1 { copy3.2s }, [x2] 55 + 56 + movi one_v.2s, #1 57 + uzp1 one_v.4s, one_v.4s, one_v.4s 58 + 59 + .Lblock: 60 + /* copy state to auxiliary vectors for the final add after the permute. */ 61 + mov state0.16b, copy0.16b 62 + mov state1.16b, copy1.16b 63 + mov state2.16b, copy2.16b 64 + mov state3.16b, copy3.16b 65 + 66 + mov w4, 20 67 + .Lpermute: 68 + /* 69 + * Permute one 64-byte block where the state matrix is stored in the four NEON 70 + * registers state0-state3. 
It performs matrix operations on four words in parallel, 71 + * but requires shuffling to rearrange the words after each round. 72 + */ 73 + 74 + .Ldoubleround: 75 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ 76 + add state0.4s, state0.4s, state1.4s 77 + eor state3.16b, state3.16b, state0.16b 78 + rev32 state3.8h, state3.8h 79 + 80 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ 81 + add state2.4s, state2.4s, state3.4s 82 + eor tmp.16b, state1.16b, state2.16b 83 + shl state1.4s, tmp.4s, #12 84 + sri state1.4s, tmp.4s, #20 85 + 86 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ 87 + add state0.4s, state0.4s, state1.4s 88 + eor tmp.16b, state3.16b, state0.16b 89 + shl state3.4s, tmp.4s, #8 90 + sri state3.4s, tmp.4s, #24 91 + 92 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ 93 + add state2.4s, state2.4s, state3.4s 94 + eor tmp.16b, state1.16b, state2.16b 95 + shl state1.4s, tmp.4s, #7 96 + sri state1.4s, tmp.4s, #25 97 + 98 + /* state1[0,1,2,3] = state1[1,2,3,0] */ 99 + ext state1.16b, state1.16b, state1.16b, #4 100 + /* state2[0,1,2,3] = state2[2,3,0,1] */ 101 + ext state2.16b, state2.16b, state2.16b, #8 102 + /* state3[0,1,2,3] = state3[1,2,3,0] */ 103 + ext state3.16b, state3.16b, state3.16b, #12 104 + 105 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */ 106 + add state0.4s, state0.4s, state1.4s 107 + eor state3.16b, state3.16b, state0.16b 108 + rev32 state3.8h, state3.8h 109 + 110 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */ 111 + add state2.4s, state2.4s, state3.4s 112 + eor tmp.16b, state1.16b, state2.16b 113 + shl state1.4s, tmp.4s, #12 114 + sri state1.4s, tmp.4s, #20 115 + 116 + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */ 117 + add state0.4s, state0.4s, state1.4s 118 + eor tmp.16b, state3.16b, state0.16b 119 + shl state3.4s, tmp.4s, #8 120 + sri state3.4s, tmp.4s, #24 121 + 122 + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */ 123 + add 
state2.4s, state2.4s, state3.4s 124 + eor tmp.16b, state1.16b, state2.16b 125 + shl state1.4s, tmp.4s, #7 126 + sri state1.4s, tmp.4s, #25 127 + 128 + /* state1[0,1,2,3] = state1[3,0,1,2] */ 129 + ext state1.16b, state1.16b, state1.16b, #12 130 + /* state2[0,1,2,3] = state2[2,3,0,1] */ 131 + ext state2.16b, state2.16b, state2.16b, #8 132 + /* state3[0,1,2,3] = state3[1,2,3,0] */ 133 + ext state3.16b, state3.16b, state3.16b, #4 134 + 135 + subs w4, w4, #2 136 + b.ne .Ldoubleround 137 + 138 + /* output0 = state0 + copy0 */ 139 + add state0.4s, state0.4s, copy0.4s 140 + /* output1 = state1 + copy1 */ 141 + add state1.4s, state1.4s, copy1.4s 142 + /* output2 = state2 + copy2 */ 143 + add state2.4s, state2.4s, copy2.4s 144 + /* output3 = state3 + copy3 */ 145 + add state3.4s, state3.4s, copy3.4s 146 + st1 { state0.16b - state3.16b }, [x0] 147 + 148 + /* 149 + * ++copy3.counter, the 'add' clears the upper half of the SIMD register 150 + * which is the expected behaviour here. 151 + */ 152 + add copy3_d, copy3_d, one_d 153 + 154 + /* output += 64, --nblocks */ 155 + add x0, x0, 64 156 + subs x3, x3, #1 157 + b.ne .Lblock 158 + 159 + /* counter = copy3.counter */ 160 + st1 { copy3.2s }, [x2] 161 + 162 + /* Zero out the potentially sensitive regs, in case nothing uses these again. */ 163 + movi state0.16b, #0 164 + movi state1.16b, #0 165 + movi state2.16b, #0 166 + movi state3.16b, #0 167 + movi copy1.16b, #0 168 + movi copy2.16b, #0 169 + ret 170 + SYM_FUNC_END(__arch_chacha20_blocks_nostack) 171 + 172 + emit_aarch64_feature_1_and
+15
arch/arm64/kernel/vdso/vgetrandom.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <uapi/asm-generic/errno.h> 4 + 5 + typeof(__cvdso_getrandom) __kernel_getrandom; 6 + 7 + ssize_t __kernel_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) 8 + { 9 + if (alternative_has_cap_likely(ARM64_HAS_FPSIMD)) 10 + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); 11 + 12 + if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) 13 + return -ENOSYS; 14 + return getrandom_syscall(buffer, len, flags); 15 + }
+4
tools/include/linux/compiler.h
··· 2 2 #ifndef _TOOLS_LINUX_COMPILER_H_ 3 3 #define _TOOLS_LINUX_COMPILER_H_ 4 4 5 + #ifndef __ASSEMBLY__ 6 + 5 7 #include <linux/compiler_types.h> 6 8 7 9 #ifndef __compiletime_error ··· 225 223 #define OPTIMIZER_HIDE_VAR(var) \ 226 224 __asm__ ("" : "=r" (var) : "0" (var)) 227 225 #endif 226 + 227 + #endif /* __ASSEMBLY__ */ 228 228 229 229 #endif /* _TOOLS_LINUX_COMPILER_H */
+2 -1
tools/testing/selftests/vDSO/Makefile
··· 9 9 TEST_GEN_PROGS += vdso_standalone_test_x86 10 10 endif 11 11 TEST_GEN_PROGS += vdso_test_correctness 12 - ifeq ($(ARCH)$(CONFIG_X86_32),$(filter $(ARCH)$(CONFIG_X86_32),x86 x86_64 loongarch)) 12 + ifeq ($(ARCH)$(CONFIG_X86_32),$(filter $(ARCH)$(CONFIG_X86_32),x86 x86_64 loongarch arm64)) 13 13 TEST_GEN_PROGS += vdso_test_getrandom 14 14 TEST_GEN_PROGS += vdso_test_chacha 15 15 endif ··· 40 40 $(OUTPUT)/vdso_test_chacha: $(top_srcdir)/tools/arch/$(SRCARCH)/vdso/vgetrandom-chacha.S 41 41 $(OUTPUT)/vdso_test_chacha: CFLAGS += -idirafter $(top_srcdir)/tools/include \ 42 42 -idirafter $(top_srcdir)/arch/$(SRCARCH)/include \ 43 + -idirafter $(top_srcdir)/arch/$(SRCARCH)/include/generated \ 43 44 -idirafter $(top_srcdir)/include \ 44 45 -D__ASSEMBLY__ -Wa,--noexecstack