Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: arm64/chacha20 - implement NEON version based on SSE3 code

This is a straight port to arm64/NEON of the x86 SSE3 implementation
of the ChaCha20 stream cipher.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Ard Biesheuvel; committed by Herbert Xu.
8621caa0 02608e02

+620
+6
arch/arm64/crypto/Kconfig
··· 72 72 depends on ARM64 73 73 select CRYPTO_HASH 74 74 75 + config CRYPTO_CHACHA20_NEON 76 + tristate "NEON accelerated ChaCha20 symmetric cipher" 77 + depends on KERNEL_MODE_NEON 78 + select CRYPTO_BLKCIPHER 79 + select CRYPTO_CHACHA20 80 + 75 81 endif
+3
arch/arm64/crypto/Makefile
··· 41 41 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o 42 42 sha512-arm64-y := sha512-glue.o sha512-core.o 43 43 44 + obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o 45 + chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o 46 + 44 47 AFLAGS_aes-ce.o := -DINTERLEAVE=4 45 48 AFLAGS_aes-neon.o := -DINTERLEAVE=4 46 49
+480
arch/arm64/crypto/chacha20-neon-core.S
··· 1 + /* 2 + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions 3 + * 4 + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + * 10 + * Based on: 11 + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions 12 + * 13 + * Copyright (C) 2015 Martin Willi 14 + * 15 + * This program is free software; you can redistribute it and/or modify 16 + * it under the terms of the GNU General Public License as published by 17 + * the Free Software Foundation; either version 2 of the License, or 18 + * (at your option) any later version. 19 + */ 20 + 21 + #include <linux/linkage.h> 22 + 23 + .text 24 + .align 6 25 + 26 + ENTRY(chacha20_block_xor_neon) 27 + // x0: Input state matrix, s 28 + // x1: 1 data block output, o 29 + // x2: 1 data block input, i 30 + 31 + // 32 + // This function encrypts one ChaCha20 block by loading the state matrix 33 + // in four NEON registers. It performs matrix operation on four words in 34 + // parallel, but requires shuffling to rearrange the words after each 35 + // round. 
36 + // 37 + 38 + // x0..3 = s0..3 39 + ld1 {v0.4s-v3.4s}, [x0] 40 + ld1 {v8.4s-v11.4s}, [x0] /* v8..v11 keep a copy of the initial state for the final add-back */ 41 + 42 + mov x3, #10 /* 10 iterations = 20 rounds (one column + one diagonal round per iteration) */ 43 + 44 + .Ldoubleround: 45 + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) 46 + add v0.4s, v0.4s, v1.4s 47 + eor v3.16b, v3.16b, v0.16b 48 + rev32 v3.8h, v3.8h 49 + 50 + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) 51 + add v2.4s, v2.4s, v3.4s 52 + eor v4.16b, v1.16b, v2.16b 53 + shl v1.4s, v4.4s, #12 54 + sri v1.4s, v4.4s, #20 55 + 56 + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) 57 + add v0.4s, v0.4s, v1.4s 58 + eor v4.16b, v3.16b, v0.16b 59 + shl v3.4s, v4.4s, #8 60 + sri v3.4s, v4.4s, #24 61 + 62 + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) 63 + add v2.4s, v2.4s, v3.4s 64 + eor v4.16b, v1.16b, v2.16b 65 + shl v1.4s, v4.4s, #7 66 + sri v1.4s, v4.4s, #25 67 + 68 + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) 69 + ext v1.16b, v1.16b, v1.16b, #4 70 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) 71 + ext v2.16b, v2.16b, v2.16b, #8 72 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) 73 + ext v3.16b, v3.16b, v3.16b, #12 74 + 75 + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) 76 + add v0.4s, v0.4s, v1.4s 77 + eor v3.16b, v3.16b, v0.16b 78 + rev32 v3.8h, v3.8h 79 + 80 + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) 81 + add v2.4s, v2.4s, v3.4s 82 + eor v4.16b, v1.16b, v2.16b 83 + shl v1.4s, v4.4s, #12 84 + sri v1.4s, v4.4s, #20 85 + 86 + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) 87 + add v0.4s, v0.4s, v1.4s 88 + eor v4.16b, v3.16b, v0.16b 89 + shl v3.4s, v4.4s, #8 90 + sri v3.4s, v4.4s, #24 91 + 92 + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) 93 + add v2.4s, v2.4s, v3.4s 94 + eor v4.16b, v1.16b, v2.16b 95 + shl v1.4s, v4.4s, #7 96 + sri v1.4s, v4.4s, #25 97 + 98 + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) 99 + ext v1.16b, v1.16b, v1.16b, #12 100 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) 101 + ext v2.16b, v2.16b, v2.16b, #8 102 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) 103 + ext v3.16b, v3.16b, v3.16b, #4 104 + 105 + subs x3, x3, #1 106 + b.ne .Ldoubleround 107 + 108 + ld1 {v4.16b-v7.16b}, [x2] 109 + 110 + // o0 = i0 ^ (x0 + s0) 
111 + add v0.4s, v0.4s, v8.4s 112 + eor v0.16b, v0.16b, v4.16b 113 + 114 + // o1 = i1 ^ (x1 + s1) 115 + add v1.4s, v1.4s, v9.4s 116 + eor v1.16b, v1.16b, v5.16b 117 + 118 + // o2 = i2 ^ (x2 + s2) 119 + add v2.4s, v2.4s, v10.4s 120 + eor v2.16b, v2.16b, v6.16b 121 + 122 + // o3 = i3 ^ (x3 + s3) 123 + add v3.4s, v3.4s, v11.4s 124 + eor v3.16b, v3.16b, v7.16b 125 + 126 + st1 {v0.16b-v3.16b}, [x1] 127 + 128 + ret 129 + ENDPROC(chacha20_block_xor_neon) 130 + 131 + .align 6 132 + ENTRY(chacha20_4block_xor_neon) 133 + // x0: Input state matrix, s 134 + // x1: 4 data blocks output, o 135 + // x2: 4 data blocks input, i 136 + 137 + // 138 + // This function encrypts four consecutive ChaCha20 blocks by loading 139 + // the state matrix in NEON registers four times. The algorithm performs 140 + // each operation on the corresponding word of each state matrix, hence 141 + // requires no word shuffling. For final XORing step we transpose the 142 + // matrix by interleaving 32- and then 64-bit words, which allows us to 143 + // do XOR in NEON registers. 
144 + // 145 + adr x3, CTRINC /* v16 = {0,1,2,3}: per-block counter offsets */ 146 + ld1 {v16.4s}, [x3] 147 + 148 + // x0..15[0-3] = s0..3[0..3] 149 + mov x4, x0 150 + ld4r { v0.4s- v3.4s}, [x4], #16 151 + ld4r { v4.4s- v7.4s}, [x4], #16 152 + ld4r { v8.4s-v11.4s}, [x4], #16 153 + ld4r {v12.4s-v15.4s}, [x4] 154 + 155 + // x12 += counter values 0-3 156 + add v12.4s, v12.4s, v16.4s 157 + 158 + mov x3, #10 159 + 160 + .Ldoubleround4: 161 + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) 162 + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) 163 + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) 164 + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) 165 + add v0.4s, v0.4s, v4.4s 166 + add v1.4s, v1.4s, v5.4s 167 + add v2.4s, v2.4s, v6.4s 168 + add v3.4s, v3.4s, v7.4s 169 + 170 + eor v12.16b, v12.16b, v0.16b 171 + eor v13.16b, v13.16b, v1.16b 172 + eor v14.16b, v14.16b, v2.16b 173 + eor v15.16b, v15.16b, v3.16b 174 + 175 + rev32 v12.8h, v12.8h 176 + rev32 v13.8h, v13.8h 177 + rev32 v14.8h, v14.8h 178 + rev32 v15.8h, v15.8h 179 + 180 + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) 181 + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) 182 + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) 183 + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) 184 + add v8.4s, v8.4s, v12.4s 185 + add v9.4s, v9.4s, v13.4s 186 + add v10.4s, v10.4s, v14.4s 187 + add v11.4s, v11.4s, v15.4s 188 + 189 + eor v17.16b, v4.16b, v8.16b 190 + eor v18.16b, v5.16b, v9.16b 191 + eor v19.16b, v6.16b, v10.16b 192 + eor v20.16b, v7.16b, v11.16b 193 + 194 + shl v4.4s, v17.4s, #12 195 + shl v5.4s, v18.4s, #12 196 + shl v6.4s, v19.4s, #12 197 + shl v7.4s, v20.4s, #12 198 + 199 + sri v4.4s, v17.4s, #20 200 + sri v5.4s, v18.4s, #20 201 + sri v6.4s, v19.4s, #20 202 + sri v7.4s, v20.4s, #20 203 + 204 + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) 205 + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) 206 + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) 207 + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) 208 + add v0.4s, v0.4s, v4.4s 209 + add v1.4s, v1.4s, v5.4s 210 + add v2.4s, v2.4s, v6.4s 211 + add v3.4s, v3.4s, v7.4s 212 + 213 + eor v17.16b, v12.16b, 
v0.16b 214 + eor v18.16b, v13.16b, v1.16b 215 + eor v19.16b, v14.16b, v2.16b 216 + eor v20.16b, v15.16b, v3.16b 217 + 218 + shl v12.4s, v17.4s, #8 219 + shl v13.4s, v18.4s, #8 220 + shl v14.4s, v19.4s, #8 221 + shl v15.4s, v20.4s, #8 222 + 223 + sri v12.4s, v17.4s, #24 224 + sri v13.4s, v18.4s, #24 225 + sri v14.4s, v19.4s, #24 226 + sri v15.4s, v20.4s, #24 227 + 228 + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) 229 + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) 230 + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) 231 + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) 232 + add v8.4s, v8.4s, v12.4s 233 + add v9.4s, v9.4s, v13.4s 234 + add v10.4s, v10.4s, v14.4s 235 + add v11.4s, v11.4s, v15.4s 236 + 237 + eor v17.16b, v4.16b, v8.16b 238 + eor v18.16b, v5.16b, v9.16b 239 + eor v19.16b, v6.16b, v10.16b 240 + eor v20.16b, v7.16b, v11.16b 241 + 242 + shl v4.4s, v17.4s, #7 243 + shl v5.4s, v18.4s, #7 244 + shl v6.4s, v19.4s, #7 245 + shl v7.4s, v20.4s, #7 246 + 247 + sri v4.4s, v17.4s, #25 248 + sri v5.4s, v18.4s, #25 249 + sri v6.4s, v19.4s, #25 250 + sri v7.4s, v20.4s, #25 251 + 252 + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) 253 + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) 254 + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) 255 + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) 256 + add v0.4s, v0.4s, v5.4s 257 + add v1.4s, v1.4s, v6.4s 258 + add v2.4s, v2.4s, v7.4s 259 + add v3.4s, v3.4s, v4.4s 260 + 261 + eor v15.16b, v15.16b, v0.16b 262 + eor v12.16b, v12.16b, v1.16b 263 + eor v13.16b, v13.16b, v2.16b 264 + eor v14.16b, v14.16b, v3.16b 265 + 266 + rev32 v15.8h, v15.8h 267 + rev32 v12.8h, v12.8h 268 + rev32 v13.8h, v13.8h 269 + rev32 v14.8h, v14.8h 270 + 271 + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) 272 + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) 273 + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) 274 + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) 275 + add v10.4s, v10.4s, v15.4s 276 + add v11.4s, v11.4s, v12.4s 277 + add v8.4s, v8.4s, v13.4s 278 + add v9.4s, v9.4s, v14.4s 279 + 280 + eor v17.16b, v5.16b, v10.16b 281 + eor 
v18.16b, v6.16b, v11.16b 282 + eor v19.16b, v7.16b, v8.16b 283 + eor v20.16b, v4.16b, v9.16b 284 + 285 + shl v5.4s, v17.4s, #12 286 + shl v6.4s, v18.4s, #12 287 + shl v7.4s, v19.4s, #12 288 + shl v4.4s, v20.4s, #12 289 + 290 + sri v5.4s, v17.4s, #20 291 + sri v6.4s, v18.4s, #20 292 + sri v7.4s, v19.4s, #20 293 + sri v4.4s, v20.4s, #20 294 + 295 + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) 296 + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) 297 + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) 298 + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) 299 + add v0.4s, v0.4s, v5.4s 300 + add v1.4s, v1.4s, v6.4s 301 + add v2.4s, v2.4s, v7.4s 302 + add v3.4s, v3.4s, v4.4s 303 + 304 + eor v17.16b, v15.16b, v0.16b 305 + eor v18.16b, v12.16b, v1.16b 306 + eor v19.16b, v13.16b, v2.16b 307 + eor v20.16b, v14.16b, v3.16b 308 + 309 + shl v15.4s, v17.4s, #8 310 + shl v12.4s, v18.4s, #8 311 + shl v13.4s, v19.4s, #8 312 + shl v14.4s, v20.4s, #8 313 + 314 + sri v15.4s, v17.4s, #24 315 + sri v12.4s, v18.4s, #24 316 + sri v13.4s, v19.4s, #24 317 + sri v14.4s, v20.4s, #24 318 + 319 + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) 320 + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) 321 + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) 322 + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) 323 + add v10.4s, v10.4s, v15.4s 324 + add v11.4s, v11.4s, v12.4s 325 + add v8.4s, v8.4s, v13.4s 326 + add v9.4s, v9.4s, v14.4s 327 + 328 + eor v17.16b, v5.16b, v10.16b 329 + eor v18.16b, v6.16b, v11.16b 330 + eor v19.16b, v7.16b, v8.16b 331 + eor v20.16b, v4.16b, v9.16b 332 + 333 + shl v5.4s, v17.4s, #7 334 + shl v6.4s, v18.4s, #7 335 + shl v7.4s, v19.4s, #7 336 + shl v4.4s, v20.4s, #7 337 + 338 + sri v5.4s, v17.4s, #25 339 + sri v6.4s, v18.4s, #25 340 + sri v7.4s, v19.4s, #25 341 + sri v4.4s, v20.4s, #25 342 + 343 + subs x3, x3, #1 344 + b.ne .Ldoubleround4 345 + 346 + // x0[0-3] += s0[0] 347 + // x1[0-3] += s0[1] 348 + // x2[0-3] += s0[2] 349 + // x3[0-3] += s0[3] 350 + ld4r {v17.4s-v20.4s}, [x0], #16 351 + add v0.4s, v0.4s, v17.4s 352 + add v1.4s, v1.4s, v18.4s 
353 + add v2.4s, v2.4s, v19.4s 354 + add v3.4s, v3.4s, v20.4s 355 + 356 + // x4[0-3] += s1[0] 357 + // x5[0-3] += s1[1] 358 + // x6[0-3] += s1[2] 359 + // x7[0-3] += s1[3] 360 + ld4r {v21.4s-v24.4s}, [x0], #16 361 + add v4.4s, v4.4s, v21.4s 362 + add v5.4s, v5.4s, v22.4s 363 + add v6.4s, v6.4s, v23.4s 364 + add v7.4s, v7.4s, v24.4s 365 + 366 + // x8[0-3] += s2[0] 367 + // x9[0-3] += s2[1] 368 + // x10[0-3] += s2[2] 369 + // x11[0-3] += s2[3] 370 + ld4r {v17.4s-v20.4s}, [x0], #16 371 + add v8.4s, v8.4s, v17.4s 372 + add v9.4s, v9.4s, v18.4s 373 + add v10.4s, v10.4s, v19.4s 374 + add v11.4s, v11.4s, v20.4s 375 + 376 + // x12[0-3] += s3[0] 377 + // x13[0-3] += s3[1] 378 + // x14[0-3] += s3[2] 379 + // x15[0-3] += s3[3] 380 + ld4r {v21.4s-v24.4s}, [x0] 381 + add v12.4s, v12.4s, v21.4s 382 + add v13.4s, v13.4s, v22.4s 383 + add v14.4s, v14.4s, v23.4s 384 + add v15.4s, v15.4s, v24.4s 385 + 386 + // x12 += counter values 0-3 387 + add v12.4s, v12.4s, v16.4s 388 + 389 + ld1 {v16.16b-v19.16b}, [x2], #64 390 + ld1 {v20.16b-v23.16b}, [x2], #64 391 + 392 + // interleave 32-bit words in state n, n+1 393 + zip1 v24.4s, v0.4s, v1.4s 394 + zip1 v25.4s, v2.4s, v3.4s 395 + zip1 v26.4s, v4.4s, v5.4s 396 + zip1 v27.4s, v6.4s, v7.4s 397 + zip1 v28.4s, v8.4s, v9.4s 398 + zip1 v29.4s, v10.4s, v11.4s 399 + zip1 v30.4s, v12.4s, v13.4s 400 + zip1 v31.4s, v14.4s, v15.4s 401 + 402 + zip2 v1.4s, v0.4s, v1.4s 403 + zip2 v3.4s, v2.4s, v3.4s 404 + zip2 v5.4s, v4.4s, v5.4s 405 + zip2 v7.4s, v6.4s, v7.4s 406 + zip2 v9.4s, v8.4s, v9.4s 407 + zip2 v11.4s, v10.4s, v11.4s 408 + zip2 v13.4s, v12.4s, v13.4s 409 + zip2 v15.4s, v14.4s, v15.4s 410 + 411 + mov v0.16b, v24.16b 412 + mov v2.16b, v25.16b 413 + mov v4.16b, v26.16b 414 + mov v6.16b, v27.16b 415 + mov v8.16b, v28.16b 416 + mov v10.16b, v29.16b 417 + mov v12.16b, v30.16b 418 + mov v14.16b, v31.16b 419 + 420 + // interleave 64-bit words in state n, n+2 421 + zip1 v24.2d, v0.2d, v2.2d 422 + zip1 v25.2d, v1.2d, v3.2d 423 + zip1 v26.2d, v4.2d, v6.2d 
424 + zip1 v27.2d, v5.2d, v7.2d 425 + zip1 v28.2d, v8.2d, v10.2d 426 + zip1 v29.2d, v9.2d, v11.2d 427 + zip1 v30.2d, v12.2d, v14.2d 428 + zip1 v31.2d, v13.2d, v15.2d 429 + 430 + zip2 v2.2d, v0.2d, v2.2d 431 + zip2 v3.2d, v1.2d, v3.2d 432 + zip2 v6.2d, v4.2d, v6.2d 433 + zip2 v7.2d, v5.2d, v7.2d 434 + zip2 v10.2d, v8.2d, v10.2d 435 + zip2 v11.2d, v9.2d, v11.2d 436 + zip2 v14.2d, v12.2d, v14.2d 437 + zip2 v15.2d, v13.2d, v15.2d 438 + 439 + mov v0.16b, v24.16b 440 + mov v1.16b, v25.16b 441 + mov v4.16b, v26.16b 442 + mov v5.16b, v27.16b 443 + 444 + mov v8.16b, v28.16b 445 + mov v9.16b, v29.16b 446 + mov v12.16b, v30.16b 447 + mov v13.16b, v31.16b 448 + 449 + ld1 {v24.16b-v27.16b}, [x2], #64 450 + ld1 {v28.16b-v31.16b}, [x2] 451 + 452 + // xor with corresponding input, write to output 453 + eor v16.16b, v16.16b, v0.16b 454 + eor v17.16b, v17.16b, v4.16b 455 + eor v18.16b, v18.16b, v8.16b 456 + eor v19.16b, v19.16b, v12.16b 457 + st1 {v16.16b-v19.16b}, [x1], #64 458 + 459 + eor v20.16b, v20.16b, v2.16b 460 + eor v21.16b, v21.16b, v6.16b 461 + eor v22.16b, v22.16b, v10.16b 462 + eor v23.16b, v23.16b, v14.16b 463 + st1 {v20.16b-v23.16b}, [x1], #64 464 + 465 + eor v24.16b, v24.16b, v1.16b 466 + eor v25.16b, v25.16b, v5.16b 467 + eor v26.16b, v26.16b, v9.16b 468 + eor v27.16b, v27.16b, v13.16b 469 + st1 {v24.16b-v27.16b}, [x1], #64 470 + 471 + eor v28.16b, v28.16b, v3.16b 472 + eor v29.16b, v29.16b, v7.16b 473 + eor v30.16b, v30.16b, v11.16b 474 + eor v31.16b, v31.16b, v15.16b 475 + st1 {v28.16b-v31.16b}, [x1] 476 + 477 + ret 478 + ENDPROC(chacha20_4block_xor_neon) 479 + 480 + CTRINC: .word 0, 1, 2, 3 /* counter increments for the four parallel blocks (added to state word 12); NOTE(review): literal lives in .text and is reached via plain adr - verify reach/placement if file grows */
+131
arch/arm64/crypto/chacha20-neon-glue.c
··· 1 + /* 2 + * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions 3 + * 4 + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> 5 + * 6 + * This program is free software; you can redistribute it and/or modify 7 + * it under the terms of the GNU General Public License version 2 as 8 + * published by the Free Software Foundation. 9 + * 10 + * Based on: 11 + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code 12 + * 13 + * Copyright (C) 2015 Martin Willi 14 + * 15 + * This program is free software; you can redistribute it and/or modify 16 + * it under the terms of the GNU General Public License as published by 17 + * the Free Software Foundation; either version 2 of the License, or 18 + * (at your option) any later version. 19 + */ 20 + 21 + #include <crypto/algapi.h> 22 + #include <crypto/chacha20.h> 23 + #include <linux/crypto.h> 24 + #include <linux/kernel.h> 25 + #include <linux/module.h> 26 + 27 + #include <asm/neon.h> 28 + 29 + asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); 30 + asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); /* implemented in chacha20-neon-core.S; the caller here brackets all uses in kernel_neon_begin()/kernel_neon_end() */ 31 + 32 + static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src, 33 + unsigned int bytes) 34 + { 35 + u8 buf[CHACHA20_BLOCK_SIZE]; /* bounce buffer for a trailing partial block */ 36 + 37 + while (bytes >= CHACHA20_BLOCK_SIZE * 4) { 38 + chacha20_4block_xor_neon(state, dst, src); 39 + bytes -= CHACHA20_BLOCK_SIZE * 4; 40 + src += CHACHA20_BLOCK_SIZE * 4; 41 + dst += CHACHA20_BLOCK_SIZE * 4; 42 + state[12] += 4; /* state[12] is the 32-bit block counter */ 43 + } 44 + while (bytes >= CHACHA20_BLOCK_SIZE) { 45 + chacha20_block_xor_neon(state, dst, src); 46 + bytes -= CHACHA20_BLOCK_SIZE; 47 + src += CHACHA20_BLOCK_SIZE; 48 + dst += CHACHA20_BLOCK_SIZE; 49 + state[12]++; 50 + } 51 + if (bytes) { 52 + memcpy(buf, src, bytes); 53 + chacha20_block_xor_neon(state, buf, buf); 54 + memcpy(dst, buf, bytes); 55 + } 56 + } 57 + 58 + static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst, 59 + struct scatterlist *src, 
unsigned int nbytes) 60 + { 61 + struct blkcipher_walk walk; 62 + u32 state[16]; 63 + int err; 64 + 65 + if (nbytes <= CHACHA20_BLOCK_SIZE) 66 + return crypto_chacha20_crypt(desc, dst, src, nbytes); /* short request: use the generic scalar code (presumably not worth the NEON save/restore - verify) */ 67 + 68 + blkcipher_walk_init(&walk, dst, src, nbytes); 69 + err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE); 70 + 71 + crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv); 72 + 73 + kernel_neon_begin(); 74 + 75 + while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { 76 + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, 77 + rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); 78 + err = blkcipher_walk_done(desc, &walk, 79 + walk.nbytes % CHACHA20_BLOCK_SIZE); 80 + } 81 + 82 + if (walk.nbytes) { 83 + chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, 84 + walk.nbytes); 85 + err = blkcipher_walk_done(desc, &walk, 0); 86 + } 87 + 88 + kernel_neon_end(); 89 + 90 + return err; 91 + } 92 + 93 + static struct crypto_alg alg = { /* NOTE(review): legacy blkcipher registration; later kernels use the skcipher API - verify against target tree */ 94 + .cra_name = "chacha20", 95 + .cra_driver_name = "chacha20-neon", 96 + .cra_priority = 300, 97 + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, 98 + .cra_blocksize = 1, 99 + .cra_type = &crypto_blkcipher_type, 100 + .cra_ctxsize = sizeof(struct chacha20_ctx), 101 + .cra_alignmask = sizeof(u32) - 1, 102 + .cra_module = THIS_MODULE, 103 + .cra_u = { 104 + .blkcipher = { 105 + .min_keysize = CHACHA20_KEY_SIZE, 106 + .max_keysize = CHACHA20_KEY_SIZE, 107 + .ivsize = CHACHA20_IV_SIZE, 108 + .geniv = "seqiv", 109 + .setkey = crypto_chacha20_setkey, 110 + .encrypt = chacha20_simd, 111 + .decrypt = chacha20_simd, 112 + }, 113 + }, 114 + }; 115 + 116 + static int __init chacha20_simd_mod_init(void) 117 + { 118 + return crypto_register_alg(&alg); 119 + } 120 + 121 + static void __exit chacha20_simd_mod_fini(void) 122 + { 123 + crypto_unregister_alg(&alg); 124 + } 125 + 126 + module_init(chacha20_simd_mod_init); 127 + module_exit(chacha20_simd_mod_fini); 128 + 129 + MODULE_AUTHOR("Ard Biesheuvel 
<ard.biesheuvel@linaro.org>"); 130 + MODULE_LICENSE("GPL v2"); 131 + MODULE_ALIAS_CRYPTO("chacha20");