Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: chacha20 - Fix chacha20_block() keystream alignment (again)

In commit 9f480faec58c ("crypto: chacha20 - Fix keystream alignment for
chacha20_block()"), I had missed that chacha20_block() can be called
directly on the buffer passed to get_random_bytes(), which can have any
alignment. So, while my commit didn't break anything, it didn't fully
solve the alignment problems.

Revert my solution and just update chacha20_block() to use
put_unaligned_le32(), so the output buffer need not be aligned.
This is simpler, and on many CPUs it's the same speed.

But, I kept the 'tmp' buffers in extract_crng_user() and
_get_random_bytes() 4-byte aligned, since that alignment is actually
needed for _crng_backtrack_protect() too.

Reported-by: Stephan Müller <smueller@chronox.de>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Eric Biggers; committed by Herbert Xu.
a5e9f557 78105c7e

+20 -20
+4 -3
crypto/chacha20_generic.c
··· 18 18 static void chacha20_docrypt(u32 *state, u8 *dst, const u8 *src, 19 19 unsigned int bytes) 20 20 { 21 - u32 stream[CHACHA20_BLOCK_WORDS]; 21 + /* aligned to potentially speed up crypto_xor() */ 22 + u8 stream[CHACHA20_BLOCK_SIZE] __aligned(sizeof(long)); 22 23 23 24 if (dst != src) 24 25 memcpy(dst, src, bytes); 25 26 26 27 while (bytes >= CHACHA20_BLOCK_SIZE) { 27 28 chacha20_block(state, stream); 28 - crypto_xor(dst, (const u8 *)stream, CHACHA20_BLOCK_SIZE); 29 + crypto_xor(dst, stream, CHACHA20_BLOCK_SIZE); 29 30 bytes -= CHACHA20_BLOCK_SIZE; 30 31 dst += CHACHA20_BLOCK_SIZE; 31 32 } 32 33 if (bytes) { 33 34 chacha20_block(state, stream); 34 - crypto_xor(dst, (const u8 *)stream, bytes); 35 + crypto_xor(dst, stream, bytes); 35 36 } 36 37 } 37 38
+12 -12
drivers/char/random.c
··· 433 433 static unsigned long crng_global_init_time = 0; 434 434 #define CRNG_INIT_CNT_THRESH (2*CHACHA20_KEY_SIZE) 435 435 static void _extract_crng(struct crng_state *crng, 436 - __u32 out[CHACHA20_BLOCK_WORDS]); 436 + __u8 out[CHACHA20_BLOCK_SIZE]); 437 437 static void _crng_backtrack_protect(struct crng_state *crng, 438 - __u32 tmp[CHACHA20_BLOCK_WORDS], int used); 438 + __u8 tmp[CHACHA20_BLOCK_SIZE], int used); 439 439 static void process_random_ready_list(void); 440 440 static void _get_random_bytes(void *buf, int nbytes); 441 441 ··· 921 921 unsigned long flags; 922 922 int i, num; 923 923 union { 924 - __u32 block[CHACHA20_BLOCK_WORDS]; 924 + __u8 block[CHACHA20_BLOCK_SIZE]; 925 925 __u32 key[8]; 926 926 } buf; 927 927 ··· 968 968 } 969 969 970 970 static void _extract_crng(struct crng_state *crng, 971 - __u32 out[CHACHA20_BLOCK_WORDS]) 971 + __u8 out[CHACHA20_BLOCK_SIZE]) 972 972 { 973 973 unsigned long v, flags; 974 974 ··· 985 985 spin_unlock_irqrestore(&crng->lock, flags); 986 986 } 987 987 988 - static void extract_crng(__u32 out[CHACHA20_BLOCK_WORDS]) 988 + static void extract_crng(__u8 out[CHACHA20_BLOCK_SIZE]) 989 989 { 990 990 struct crng_state *crng = NULL; 991 991 ··· 1003 1003 * enough) to mutate the CRNG key to provide backtracking protection. 
1004 1004 */ 1005 1005 static void _crng_backtrack_protect(struct crng_state *crng, 1006 - __u32 tmp[CHACHA20_BLOCK_WORDS], int used) 1006 + __u8 tmp[CHACHA20_BLOCK_SIZE], int used) 1007 1007 { 1008 1008 unsigned long flags; 1009 1009 __u32 *s, *d; ··· 1015 1015 used = 0; 1016 1016 } 1017 1017 spin_lock_irqsave(&crng->lock, flags); 1018 - s = &tmp[used / sizeof(__u32)]; 1018 + s = (__u32 *) &tmp[used]; 1019 1019 d = &crng->state[4]; 1020 1020 for (i=0; i < 8; i++) 1021 1021 *d++ ^= *s++; 1022 1022 spin_unlock_irqrestore(&crng->lock, flags); 1023 1023 } 1024 1024 1025 - static void crng_backtrack_protect(__u32 tmp[CHACHA20_BLOCK_WORDS], int used) 1025 + static void crng_backtrack_protect(__u8 tmp[CHACHA20_BLOCK_SIZE], int used) 1026 1026 { 1027 1027 struct crng_state *crng = NULL; 1028 1028 ··· 1038 1038 static ssize_t extract_crng_user(void __user *buf, size_t nbytes) 1039 1039 { 1040 1040 ssize_t ret = 0, i = CHACHA20_BLOCK_SIZE; 1041 - __u32 tmp[CHACHA20_BLOCK_WORDS]; 1041 + __u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4); 1042 1042 int large_request = (nbytes > 256); 1043 1043 1044 1044 while (nbytes) { ··· 1617 1617 */ 1618 1618 static void _get_random_bytes(void *buf, int nbytes) 1619 1619 { 1620 - __u32 tmp[CHACHA20_BLOCK_WORDS]; 1620 + __u8 tmp[CHACHA20_BLOCK_SIZE] __aligned(4); 1621 1621 1622 1622 trace_get_random_bytes(nbytes, _RET_IP_); 1623 1623 ··· 2243 2243 if (use_lock) 2244 2244 read_lock_irqsave(&batched_entropy_reset_lock, flags); 2245 2245 if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) { 2246 - extract_crng((__u32 *)batch->entropy_u64); 2246 + extract_crng((u8 *)batch->entropy_u64); 2247 2247 batch->position = 0; 2248 2248 } 2249 2249 ret = batch->entropy_u64[batch->position++]; ··· 2273 2273 if (use_lock) 2274 2274 read_lock_irqsave(&batched_entropy_reset_lock, flags); 2275 2275 if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) { 2276 - extract_crng(batch->entropy_u32); 2276 + extract_crng((u8 *)batch->entropy_u32); 2277 2277 
batch->position = 0; 2278 2278 } 2279 2279 ret = batch->entropy_u32[batch->position++];
+1 -2
include/crypto/chacha20.h
··· 13 13 #define CHACHA20_IV_SIZE 16 14 14 #define CHACHA20_KEY_SIZE 32 15 15 #define CHACHA20_BLOCK_SIZE 64 16 - #define CHACHA20_BLOCK_WORDS (CHACHA20_BLOCK_SIZE / sizeof(u32)) 17 16 18 17 struct chacha20_ctx { 19 18 u32 key[8]; 20 19 }; 21 20 22 - void chacha20_block(u32 *state, u32 *stream); 21 + void chacha20_block(u32 *state, u8 *stream); 23 22 void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv); 24 23 int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, 25 24 unsigned int keysize);
+3 -3
lib/chacha20.c
··· 16 16 #include <asm/unaligned.h> 17 17 #include <crypto/chacha20.h> 18 18 19 - void chacha20_block(u32 *state, u32 *stream) 19 + void chacha20_block(u32 *state, u8 *stream) 20 20 { 21 - u32 x[16], *out = stream; 21 + u32 x[16]; 22 22 int i; 23 23 24 24 for (i = 0; i < ARRAY_SIZE(x); i++) ··· 67 67 } 68 68 69 69 for (i = 0; i < ARRAY_SIZE(x); i++) 70 - out[i] = cpu_to_le32(x[i] + state[i]); 70 + put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); 71 71 72 72 state[12]++; 73 73 }