Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/aes - avoid expanded lookup tables in the final round

For the final round, avoid the expanded and padded lookup tables
exported by the generic AES driver. Instead, for encryption, we can
perform byte loads from the same table we used for the inner rounds,
which will still be hot in the caches. For decryption, use the inverse
AES Sbox directly, which is 4x smaller than the inverse lookup table
exported by the generic driver.

This should significantly reduce the Dcache footprint of our code,
which makes the code more robust against timing attacks. It does not
introduce any additional module dependencies, given that we already
rely on the core AES module for the shared key expansion routines.
It also frees up register x18, which is not available as a scratch
register on all platforms, and so avoiding it improves the
shareability of this code.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Ard Biesheuvel and committed by
Herbert Xu
7c83d689 0d149ce6

+106 -44
+106 -44
arch/arm64/crypto/aes-cipher-core.S
··· 10 10 11 11 #include <linux/linkage.h> 12 12 #include <asm/assembler.h> 13 + #include <asm/cache.h> 13 14 14 15 .text 15 16 ··· 18 17 out .req x1 19 18 in .req x2 20 19 rounds .req x3 21 - tt .req x4 22 - lt .req x2 20 + tt .req x2 23 21 24 - .macro __pair, enc, reg0, reg1, in0, in1e, in1d, shift 25 - ubfx \reg0, \in0, #\shift, #8 26 - .if \enc 27 - ubfx \reg1, \in1e, #\shift, #8 22 + .macro __pair1, sz, op, reg0, reg1, in0, in1e, in1d, shift 23 + .ifc \op\shift, b0 24 + ubfiz \reg0, \in0, #2, #8 25 + ubfiz \reg1, \in1e, #2, #8 28 26 .else 29 - ubfx \reg1, \in1d, #\shift, #8 27 + ubfx \reg0, \in0, #\shift, #8 28 + ubfx \reg1, \in1e, #\shift, #8 30 29 .endif 30 + 31 + /* 32 + * AArch64 cannot do byte size indexed loads from a table containing 33 + * 32-bit quantities, i.e., 'ldrb w12, [tt, w12, uxtw #2]' is not a 34 + * valid instruction. So perform the shift explicitly first for the 35 + * high bytes (the low byte is shifted implicitly by using ubfiz rather 36 + * than ubfx above) 37 + */ 38 + .ifnc \op, b 31 39 ldr \reg0, [tt, \reg0, uxtw #2] 32 40 ldr \reg1, [tt, \reg1, uxtw #2] 41 + .else 42 + .if \shift > 0 43 + lsl \reg0, \reg0, #2 44 + lsl \reg1, \reg1, #2 45 + .endif 46 + ldrb \reg0, [tt, \reg0, uxtw] 47 + ldrb \reg1, [tt, \reg1, uxtw] 48 + .endif 33 49 .endm 34 50 35 - .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc 51 + .macro __pair0, sz, op, reg0, reg1, in0, in1e, in1d, shift 52 + ubfx \reg0, \in0, #\shift, #8 53 + ubfx \reg1, \in1d, #\shift, #8 54 + ldr\op \reg0, [tt, \reg0, uxtw #\sz] 55 + ldr\op \reg1, [tt, \reg1, uxtw #\sz] 56 + .endm 57 + 58 + .macro __hround, out0, out1, in0, in1, in2, in3, t0, t1, enc, sz, op 36 59 ldp \out0, \out1, [rk], #8 37 60 38 - __pair \enc, w13, w14, \in0, \in1, \in3, 0 39 - __pair \enc, w15, w16, \in1, \in2, \in0, 8 40 - __pair \enc, w17, w18, \in2, \in3, \in1, 16 41 - __pair \enc, \t0, \t1, \in3, \in0, \in2, 24 61 + __pair\enc \sz, \op, w12, w13, \in0, \in1, \in3, 0 62 + __pair\enc \sz, \op, w14, w15, 
\in1, \in2, \in0, 8 63 + __pair\enc \sz, \op, w16, w17, \in2, \in3, \in1, 16 64 + __pair\enc \sz, \op, \t0, \t1, \in3, \in0, \in2, 24 42 65 43 - eor \out0, \out0, w13 44 - eor \out1, \out1, w14 45 - eor \out0, \out0, w15, ror #24 46 - eor \out1, \out1, w16, ror #24 47 - eor \out0, \out0, w17, ror #16 48 - eor \out1, \out1, w18, ror #16 66 + eor \out0, \out0, w12 67 + eor \out1, \out1, w13 68 + eor \out0, \out0, w14, ror #24 69 + eor \out1, \out1, w15, ror #24 70 + eor \out0, \out0, w16, ror #16 71 + eor \out1, \out1, w17, ror #16 49 72 eor \out0, \out0, \t0, ror #8 50 73 eor \out1, \out1, \t1, ror #8 51 74 .endm 52 75 53 - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3 54 - __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1 55 - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1 76 + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op 77 + __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op 78 + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op 56 79 .endm 57 80 58 - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3 59 - __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0 60 - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0 81 + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op 82 + __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op 83 + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op 61 84 .endm 62 85 63 - .macro do_crypt, round, ttab, ltab 64 - ldp w5, w6, [in] 65 - ldp w7, w8, [in, #8] 66 - ldp w9, w10, [rk], #16 67 - ldp w11, w12, [rk, #-8] 86 + .macro do_crypt, round, ttab, ltab, bsz 87 + ldp w4, w5, [in] 88 + ldp w6, w7, [in, #8] 89 + ldp w8, w9, [rk], #16 90 + ldp w10, w11, [rk, #-8] 68 91 92 + CPU_BE( rev w4, w4 ) 69 93 CPU_BE( rev w5, w5 ) 70 94 CPU_BE( rev w6, w6 ) 71 95 CPU_BE( rev w7, w7 ) 72 - CPU_BE( rev w8, w8 ) 73 96 97 + eor w4, w4, w8 74 98 eor w5, w5, w9 75 99 eor 
w6, w6, w10 76 100 eor w7, w7, w11 77 - eor w8, w8, w12 78 101 79 102 adr_l tt, \ttab 80 - adr_l lt, \ltab 81 103 82 104 tbnz rounds, #1, 1f 83 105 84 - 0: \round w9, w10, w11, w12, w5, w6, w7, w8 85 - \round w5, w6, w7, w8, w9, w10, w11, w12 106 + 0: \round w8, w9, w10, w11, w4, w5, w6, w7 107 + \round w4, w5, w6, w7, w8, w9, w10, w11 86 108 87 109 1: subs rounds, rounds, #4 88 - \round w9, w10, w11, w12, w5, w6, w7, w8 89 - csel tt, tt, lt, hi 90 - \round w5, w6, w7, w8, w9, w10, w11, w12 91 - b.hi 0b 110 + \round w8, w9, w10, w11, w4, w5, w6, w7 111 + b.ls 3f 112 + 2: \round w4, w5, w6, w7, w8, w9, w10, w11 113 + b 0b 114 + 3: adr_l tt, \ltab 115 + \round w4, w5, w6, w7, w8, w9, w10, w11, \bsz, b 92 116 117 + CPU_BE( rev w4, w4 ) 93 118 CPU_BE( rev w5, w5 ) 94 119 CPU_BE( rev w6, w6 ) 95 120 CPU_BE( rev w7, w7 ) 96 - CPU_BE( rev w8, w8 ) 97 121 98 - stp w5, w6, [out] 99 - stp w7, w8, [out, #8] 122 + stp w4, w5, [out] 123 + stp w6, w7, [out, #8] 100 124 ret 101 125 .endm 102 126 103 - .align 5 127 + .align L1_CACHE_SHIFT 128 + .type __aes_arm64_inverse_sbox, %object 129 + __aes_arm64_inverse_sbox: 130 + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 131 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb 132 + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 133 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb 134 + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d 135 + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e 136 + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 137 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 138 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 139 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 140 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda 141 + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 142 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a 143 + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 144 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 
0x02 145 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b 146 + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea 147 + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 148 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 149 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e 150 + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 151 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b 152 + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 153 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 154 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 155 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f 156 + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d 157 + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef 158 + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 159 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 160 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 161 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d 162 + .size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox 163 + 104 164 ENTRY(__aes_arm64_encrypt) 105 - do_crypt fround, crypto_ft_tab, crypto_fl_tab 165 + do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 106 166 ENDPROC(__aes_arm64_encrypt) 107 167 108 168 .align 5 109 169 ENTRY(__aes_arm64_decrypt) 110 - do_crypt iround, crypto_it_tab, crypto_il_tab 170 + do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0 111 171 ENDPROC(__aes_arm64_decrypt)