Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

crypto: arm64/sm4 - refactor and simplify CE implementation

This patch adds no new features; it only refactors and simplifies the
Crypto Extensions (CE) implementation of the SM4 algorithm:

The SM4 Crypto Extensions helper macros are extracted into a separate
header so that they can be reused by the upcoming CCM/GCM mode
optimizations.
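
For orientation, a minimal sketch of how the extracted helpers compose,
lifted from the single-block routine in the sm4-ce-core.S diff below
(function framing elided; x0 holds the round keys, x1 the destination,
x2 the source):

	SM4_PREPARE(x0)			/* load the eight round-key vectors into v24-v31 */

	ld1	{v0.16b}, [x2]		/* load one 16-byte block */
	SM4_CRYPT_BLK(v0)		/* rev32, eight sm4e (4 rounds each, 32 total), reverse back */
	st1	{v0.16b}, [x1]		/* store the result */

	ret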

CBC and CFB encryption now process four blocks at a time instead of one,
so a single ld1 instruction can load 64 bytes of data at once, which
reduces unnecessary memory accesses.
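
The reworked sm4_ce_cbc_enc 4x path (shown in full in the sm4-ce-core.S
diff below) illustrates the pattern: one ld1 fills v0-v3 with 64 bytes,
the CBC chain runs entirely in registers, and one st1 writes all four
ciphertext blocks back:

	ld1	{v0.16b-v3.16b}, [x2], #64	/* one load: four plaintext blocks */

	eor	v0.16b, v0.16b, RIV.16b		/* C0 = E(P0 ^ IV) */
	SM4_CRYPT_BLK(v0)
	eor	v1.16b, v1.16b, v0.16b		/* C1 = E(P1 ^ C0) */
	SM4_CRYPT_BLK(v1)
	eor	v2.16b, v2.16b, v1.16b		/* C2 = E(P2 ^ C1) */
	SM4_CRYPT_BLK(v2)
	eor	v3.16b, v3.16b, v2.16b		/* C3 = E(P3 ^ C2) */
	SM4_CRYPT_BLK(v3)

	st1	{v0.16b-v3.16b}, [x1], #64	/* one store: four ciphertext blocks */
	mov	RIV.16b, v3.16b			/* chain into the next iteration */

The chaining itself stays serial, since each block depends on the
previous ciphertext, so the saving comes from the batched loads and
stores rather than from parallel sm4e issue.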

The CBC/CFB/CTR paths make full use of the free vector registers to
reduce redundant memory accesses, and some instructions are rearranged
to give the CPU more opportunity for out-of-order execution.
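
The CBC decryption path combines both ideas. With v8-v15 newly usable
(the .irp register list in sm4-ce-core.S is extended accordingly), the
ciphertext stays live in v0-v3 while byte-swapped working copies are
decrypted in v8-v11, so the chaining XOR takes its previous-ciphertext
operands straight from registers instead of reloading them. A sketch of
the four-block case from the diff below:

	ld1	{v0.16b-v3.16b}, [x2], #64	/* ciphertext stays live in v0-v3 */

	rev32	v8.16b, v0.16b			/* swap into working copies v8-v11 */
	rev32	v9.16b, v1.16b
	rev32	v10.16b, v2.16b
	rev32	v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)	/* decrypt the copies */

	eor	v8.16b, v8.16b, RIV.16b		/* P0 = D(C0) ^ IV */
	eor	v9.16b, v9.16b, v0.16b		/* P1 = D(C1) ^ C0, no reload needed */
	eor	v10.16b, v10.16b, v1.16b
	eor	v11.16b, v11.16b, v2.16b

	st1	{v8.16b-v11.16b}, [x1], #64
	mov	RIV.16b, v3.16b			/* last ciphertext becomes the new IV */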

Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Authored by Tianjia Zhang, committed by Herbert Xu
ce41fefd 3c383637

523 insertions(+), 404 deletions(-)

arch/arm64/crypto/sm4-ce-asm.h (new file, +209)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 helper macros for Crypto Extensions
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#define SM4_PREPARE(ptr) \
	ld1	{v24.16b-v27.16b}, [ptr], #64; \
	ld1	{v28.16b-v31.16b}, [ptr];

#define SM4_CRYPT_BLK_BE(b0) \
	sm4e	b0.4s, v24.4s; \
	sm4e	b0.4s, v25.4s; \
	sm4e	b0.4s, v26.4s; \
	sm4e	b0.4s, v27.4s; \
	sm4e	b0.4s, v28.4s; \
	sm4e	b0.4s, v29.4s; \
	sm4e	b0.4s, v30.4s; \
	sm4e	b0.4s, v31.4s; \
	rev64	b0.4s, b0.4s; \
	ext	b0.16b, b0.16b, b0.16b, #8; \
	rev32	b0.16b, b0.16b;

#define SM4_CRYPT_BLK(b0) \
	rev32	b0.16b, b0.16b; \
	SM4_CRYPT_BLK_BE(b0);

#define SM4_CRYPT_BLK2_BE(b0, b1) \
	sm4e	b0.4s, v24.4s; \
	sm4e	b1.4s, v24.4s; \
	sm4e	b0.4s, v25.4s; \
	sm4e	b1.4s, v25.4s; \
	sm4e	b0.4s, v26.4s; \
	sm4e	b1.4s, v26.4s; \
	sm4e	b0.4s, v27.4s; \
	sm4e	b1.4s, v27.4s; \
	sm4e	b0.4s, v28.4s; \
	sm4e	b1.4s, v28.4s; \
	sm4e	b0.4s, v29.4s; \
	sm4e	b1.4s, v29.4s; \
	sm4e	b0.4s, v30.4s; \
	sm4e	b1.4s, v30.4s; \
	sm4e	b0.4s, v31.4s; \
	sm4e	b1.4s, v31.4s; \
	rev64	b0.4s, b0.4s; \
	rev64	b1.4s, b1.4s; \
	ext	b0.16b, b0.16b, b0.16b, #8; \
	ext	b1.16b, b1.16b, b1.16b, #8; \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b;

#define SM4_CRYPT_BLK2(b0, b1) \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	SM4_CRYPT_BLK2_BE(b0, b1);

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3) \
	sm4e	b0.4s, v24.4s; \
	sm4e	b1.4s, v24.4s; \
	sm4e	b2.4s, v24.4s; \
	sm4e	b3.4s, v24.4s; \
	sm4e	b0.4s, v25.4s; \
	sm4e	b1.4s, v25.4s; \
	sm4e	b2.4s, v25.4s; \
	sm4e	b3.4s, v25.4s; \
	sm4e	b0.4s, v26.4s; \
	sm4e	b1.4s, v26.4s; \
	sm4e	b2.4s, v26.4s; \
	sm4e	b3.4s, v26.4s; \
	sm4e	b0.4s, v27.4s; \
	sm4e	b1.4s, v27.4s; \
	sm4e	b2.4s, v27.4s; \
	sm4e	b3.4s, v27.4s; \
	sm4e	b0.4s, v28.4s; \
	sm4e	b1.4s, v28.4s; \
	sm4e	b2.4s, v28.4s; \
	sm4e	b3.4s, v28.4s; \
	sm4e	b0.4s, v29.4s; \
	sm4e	b1.4s, v29.4s; \
	sm4e	b2.4s, v29.4s; \
	sm4e	b3.4s, v29.4s; \
	sm4e	b0.4s, v30.4s; \
	sm4e	b1.4s, v30.4s; \
	sm4e	b2.4s, v30.4s; \
	sm4e	b3.4s, v30.4s; \
	sm4e	b0.4s, v31.4s; \
	sm4e	b1.4s, v31.4s; \
	sm4e	b2.4s, v31.4s; \
	sm4e	b3.4s, v31.4s; \
	rev64	b0.4s, b0.4s; \
	rev64	b1.4s, b1.4s; \
	rev64	b2.4s, b2.4s; \
	rev64	b3.4s, b3.4s; \
	ext	b0.16b, b0.16b, b0.16b, #8; \
	ext	b1.16b, b1.16b, b1.16b, #8; \
	ext	b2.16b, b2.16b, b2.16b, #8; \
	ext	b3.16b, b3.16b, b3.16b, #8; \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

#define SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7) \
	sm4e	b0.4s, v24.4s; \
	sm4e	b1.4s, v24.4s; \
	sm4e	b2.4s, v24.4s; \
	sm4e	b3.4s, v24.4s; \
	sm4e	b4.4s, v24.4s; \
	sm4e	b5.4s, v24.4s; \
	sm4e	b6.4s, v24.4s; \
	sm4e	b7.4s, v24.4s; \
	sm4e	b0.4s, v25.4s; \
	sm4e	b1.4s, v25.4s; \
	sm4e	b2.4s, v25.4s; \
	sm4e	b3.4s, v25.4s; \
	sm4e	b4.4s, v25.4s; \
	sm4e	b5.4s, v25.4s; \
	sm4e	b6.4s, v25.4s; \
	sm4e	b7.4s, v25.4s; \
	sm4e	b0.4s, v26.4s; \
	sm4e	b1.4s, v26.4s; \
	sm4e	b2.4s, v26.4s; \
	sm4e	b3.4s, v26.4s; \
	sm4e	b4.4s, v26.4s; \
	sm4e	b5.4s, v26.4s; \
	sm4e	b6.4s, v26.4s; \
	sm4e	b7.4s, v26.4s; \
	sm4e	b0.4s, v27.4s; \
	sm4e	b1.4s, v27.4s; \
	sm4e	b2.4s, v27.4s; \
	sm4e	b3.4s, v27.4s; \
	sm4e	b4.4s, v27.4s; \
	sm4e	b5.4s, v27.4s; \
	sm4e	b6.4s, v27.4s; \
	sm4e	b7.4s, v27.4s; \
	sm4e	b0.4s, v28.4s; \
	sm4e	b1.4s, v28.4s; \
	sm4e	b2.4s, v28.4s; \
	sm4e	b3.4s, v28.4s; \
	sm4e	b4.4s, v28.4s; \
	sm4e	b5.4s, v28.4s; \
	sm4e	b6.4s, v28.4s; \
	sm4e	b7.4s, v28.4s; \
	sm4e	b0.4s, v29.4s; \
	sm4e	b1.4s, v29.4s; \
	sm4e	b2.4s, v29.4s; \
	sm4e	b3.4s, v29.4s; \
	sm4e	b4.4s, v29.4s; \
	sm4e	b5.4s, v29.4s; \
	sm4e	b6.4s, v29.4s; \
	sm4e	b7.4s, v29.4s; \
	sm4e	b0.4s, v30.4s; \
	sm4e	b1.4s, v30.4s; \
	sm4e	b2.4s, v30.4s; \
	sm4e	b3.4s, v30.4s; \
	sm4e	b4.4s, v30.4s; \
	sm4e	b5.4s, v30.4s; \
	sm4e	b6.4s, v30.4s; \
	sm4e	b7.4s, v30.4s; \
	sm4e	b0.4s, v31.4s; \
	sm4e	b1.4s, v31.4s; \
	sm4e	b2.4s, v31.4s; \
	sm4e	b3.4s, v31.4s; \
	sm4e	b4.4s, v31.4s; \
	sm4e	b5.4s, v31.4s; \
	sm4e	b6.4s, v31.4s; \
	sm4e	b7.4s, v31.4s; \
	rev64	b0.4s, b0.4s; \
	rev64	b1.4s, b1.4s; \
	rev64	b2.4s, b2.4s; \
	rev64	b3.4s, b3.4s; \
	rev64	b4.4s, b4.4s; \
	rev64	b5.4s, b5.4s; \
	rev64	b6.4s, b6.4s; \
	rev64	b7.4s, b7.4s; \
	ext	b0.16b, b0.16b, b0.16b, #8; \
	ext	b1.16b, b1.16b, b1.16b, #8; \
	ext	b2.16b, b2.16b, b2.16b, #8; \
	ext	b3.16b, b3.16b, b3.16b, #8; \
	ext	b4.16b, b4.16b, b4.16b, #8; \
	ext	b5.16b, b5.16b, b5.16b, #8; \
	ext	b6.16b, b6.16b, b6.16b, #8; \
	ext	b7.16b, b7.16b, b7.16b, #8; \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	rev32	b4.16b, b4.16b; \
	rev32	b5.16b, b5.16b; \
	rev32	b6.16b, b6.16b; \
	rev32	b7.16b, b7.16b;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	rev32	b3.16b, b3.16b; \
	rev32	b4.16b, b4.16b; \
	rev32	b5.16b, b5.16b; \
	rev32	b6.16b, b6.16b; \
	rev32	b7.16b, b7.16b; \
	SM4_CRYPT_BLK8_BE(b0, b1, b2, b3, b4, b5, b6, b7);
arch/arm64/crypto/sm4-ce-core.S (+286, -366)
···
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include "sm4-ce-asm.h"

 .arch	armv8-a+crypto

-.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 16, 20, 24, 25, 26, 27, 28, 29, 30, 31
+.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+	20, 24, 25, 26, 27, 28, 29, 30, 31
 .set .Lv\b\().4s, \b
 .endr
···
 #define RTMP3	v19

 #define RIV	v20
-
-/* Helper macros. */
-
-#define PREPARE \
-	ld1	{v24.16b-v27.16b}, [x0], #64; \
-	ld1	{v28.16b-v31.16b}, [x0];
-
-#define SM4_CRYPT_BLK(b0) \
-	rev32	b0.16b, b0.16b; \
-	sm4e	b0.4s, v24.4s; \
-	sm4e	b0.4s, v25.4s; \
-	sm4e	b0.4s, v26.4s; \
-	sm4e	b0.4s, v27.4s; \
-	sm4e	b0.4s, v28.4s; \
-	sm4e	b0.4s, v29.4s; \
-	sm4e	b0.4s, v30.4s; \
-	sm4e	b0.4s, v31.4s; \
-	rev64	b0.4s, b0.4s; \
-	ext	b0.16b, b0.16b, b0.16b, #8; \
-	rev32	b0.16b, b0.16b;
-
-#define SM4_CRYPT_BLK4(b0, b1, b2, b3) \
-	rev32	b0.16b, b0.16b; \
-	rev32	b1.16b, b1.16b; \
-	rev32	b2.16b, b2.16b; \
-	rev32	b3.16b, b3.16b; \
-	sm4e	b0.4s, v24.4s; \
-	sm4e	b1.4s, v24.4s; \
-	sm4e	b2.4s, v24.4s; \
-	sm4e	b3.4s, v24.4s; \
-	sm4e	b0.4s, v25.4s; \
-	sm4e	b1.4s, v25.4s; \
-	sm4e	b2.4s, v25.4s; \
-	sm4e	b3.4s, v25.4s; \
-	sm4e	b0.4s, v26.4s; \
-	sm4e	b1.4s, v26.4s; \
-	sm4e	b2.4s, v26.4s; \
-	sm4e	b3.4s, v26.4s; \
-	sm4e	b0.4s, v27.4s; \
-	sm4e	b1.4s, v27.4s; \
-	sm4e	b2.4s, v27.4s; \
-	sm4e	b3.4s, v27.4s; \
-	sm4e	b0.4s, v28.4s; \
-	sm4e	b1.4s, v28.4s; \
-	sm4e	b2.4s, v28.4s; \
-	sm4e	b3.4s, v28.4s; \
-	sm4e	b0.4s, v29.4s; \
-	sm4e	b1.4s, v29.4s; \
-	sm4e	b2.4s, v29.4s; \
-	sm4e	b3.4s, v29.4s; \
-	sm4e	b0.4s, v30.4s; \
-	sm4e	b1.4s, v30.4s; \
-	sm4e	b2.4s, v30.4s; \
-	sm4e	b3.4s, v30.4s; \
-	sm4e	b0.4s, v31.4s; \
-	sm4e	b1.4s, v31.4s; \
-	sm4e	b2.4s, v31.4s; \
-	sm4e	b3.4s, v31.4s; \
-	rev64	b0.4s, b0.4s; \
-	rev64	b1.4s, b1.4s; \
-	rev64	b2.4s, b2.4s; \
-	rev64	b3.4s, b3.4s; \
-	ext	b0.16b, b0.16b, b0.16b, #8; \
-	ext	b1.16b, b1.16b, b1.16b, #8; \
-	ext	b2.16b, b2.16b, b2.16b, #8; \
-	ext	b3.16b, b3.16b, b3.16b, #8; \
-	rev32	b0.16b, b0.16b; \
-	rev32	b1.16b, b1.16b; \
-	rev32	b2.16b, b2.16b; \
-	rev32	b3.16b, b3.16b;
-
-#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7) \
-	rev32	b0.16b, b0.16b; \
-	rev32	b1.16b, b1.16b; \
-	rev32	b2.16b, b2.16b; \
-	rev32	b3.16b, b3.16b; \
-	rev32	b4.16b, b4.16b; \
-	rev32	b5.16b, b5.16b; \
-	rev32	b6.16b, b6.16b; \
-	rev32	b7.16b, b7.16b; \
-	sm4e	b0.4s, v24.4s; \
-	sm4e	b1.4s, v24.4s; \
-	sm4e	b2.4s, v24.4s; \
-	sm4e	b3.4s, v24.4s; \
-	sm4e	b4.4s, v24.4s; \
-	sm4e	b5.4s, v24.4s; \
-	sm4e	b6.4s, v24.4s; \
-	sm4e	b7.4s, v24.4s; \
-	sm4e	b0.4s, v25.4s; \
-	sm4e	b1.4s, v25.4s; \
-	sm4e	b2.4s, v25.4s; \
-	sm4e	b3.4s, v25.4s; \
-	sm4e	b4.4s, v25.4s; \
-	sm4e	b5.4s, v25.4s; \
-	sm4e	b6.4s, v25.4s; \
-	sm4e	b7.4s, v25.4s; \
-	sm4e	b0.4s, v26.4s; \
-	sm4e	b1.4s, v26.4s; \
-	sm4e	b2.4s, v26.4s; \
-	sm4e	b3.4s, v26.4s; \
-	sm4e	b4.4s, v26.4s; \
-	sm4e	b5.4s, v26.4s; \
-	sm4e	b6.4s, v26.4s; \
-	sm4e	b7.4s, v26.4s; \
-	sm4e	b0.4s, v27.4s; \
-	sm4e	b1.4s, v27.4s; \
-	sm4e	b2.4s, v27.4s; \
-	sm4e	b3.4s, v27.4s; \
-	sm4e	b4.4s, v27.4s; \
-	sm4e	b5.4s, v27.4s; \
-	sm4e	b6.4s, v27.4s; \
-	sm4e	b7.4s, v27.4s; \
-	sm4e	b0.4s, v28.4s; \
-	sm4e	b1.4s, v28.4s; \
-	sm4e	b2.4s, v28.4s; \
-	sm4e	b3.4s, v28.4s; \
-	sm4e	b4.4s, v28.4s; \
-	sm4e	b5.4s, v28.4s; \
-	sm4e	b6.4s, v28.4s; \
-	sm4e	b7.4s, v28.4s; \
-	sm4e	b0.4s, v29.4s; \
-	sm4e	b1.4s, v29.4s; \
-	sm4e	b2.4s, v29.4s; \
-	sm4e	b3.4s, v29.4s; \
-	sm4e	b4.4s, v29.4s; \
-	sm4e	b5.4s, v29.4s; \
-	sm4e	b6.4s, v29.4s; \
-	sm4e	b7.4s, v29.4s; \
-	sm4e	b0.4s, v30.4s; \
-	sm4e	b1.4s, v30.4s; \
-	sm4e	b2.4s, v30.4s; \
-	sm4e	b3.4s, v30.4s; \
-	sm4e	b4.4s, v30.4s; \
-	sm4e	b5.4s, v30.4s; \
-	sm4e	b6.4s, v30.4s; \
-	sm4e	b7.4s, v30.4s; \
-	sm4e	b0.4s, v31.4s; \
-	sm4e	b1.4s, v31.4s; \
-	sm4e	b2.4s, v31.4s; \
-	sm4e	b3.4s, v31.4s; \
-	sm4e	b4.4s, v31.4s; \
-	sm4e	b5.4s, v31.4s; \
-	sm4e	b6.4s, v31.4s; \
-	sm4e	b7.4s, v31.4s; \
-	rev64	b0.4s, b0.4s; \
-	rev64	b1.4s, b1.4s; \
-	rev64	b2.4s, b2.4s; \
-	rev64	b3.4s, b3.4s; \
-	rev64	b4.4s, b4.4s; \
-	rev64	b5.4s, b5.4s; \
-	rev64	b6.4s, b6.4s; \
-	rev64	b7.4s, b7.4s; \
-	ext	b0.16b, b0.16b, b0.16b, #8; \
-	ext	b1.16b, b1.16b, b1.16b, #8; \
-	ext	b2.16b, b2.16b, b2.16b, #8; \
-	ext	b3.16b, b3.16b, b3.16b, #8; \
-	ext	b4.16b, b4.16b, b4.16b, #8; \
-	ext	b5.16b, b5.16b, b5.16b, #8; \
-	ext	b6.16b, b6.16b, b6.16b, #8; \
-	ext	b7.16b, b7.16b, b7.16b, #8; \
-	rev32	b0.16b, b0.16b; \
-	rev32	b1.16b, b1.16b; \
-	rev32	b2.16b, b2.16b; \
-	rev32	b3.16b, b3.16b; \
-	rev32	b4.16b, b4.16b; \
-	rev32	b5.16b, b5.16b; \
-	rev32	b6.16b, b6.16b; \
-	rev32	b7.16b, b7.16b;


 .align 3
···
	 * x1: dst
	 * x2: src
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

	ld1	{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
···
	 * x2: src
	 * w3: nblocks
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

 .Lcrypt_loop_blk:
	sub	w3, w3, #8;
···
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

-	ld1	{RIV.16b}, [x3];
+	ld1	{RIV.16b}, [x3]

-.Lcbc_enc_loop:
-	sub	w4, w4, #1;
+.Lcbc_enc_loop_4x:
+	cmp	w4, #4
+	blt	.Lcbc_enc_loop_1x

-	ld1	{RTMP0.16b}, [x2], #16;
-	eor	RIV.16b, RIV.16b, RTMP0.16b;
+	sub	w4, w4, #4

-	SM4_CRYPT_BLK(RIV);
+	ld1	{v0.16b-v3.16b}, [x2], #64

-	st1	{RIV.16b}, [x1], #16;
+	eor	v0.16b, v0.16b, RIV.16b
+	SM4_CRYPT_BLK(v0)
+	eor	v1.16b, v1.16b, v0.16b
+	SM4_CRYPT_BLK(v1)
+	eor	v2.16b, v2.16b, v1.16b
+	SM4_CRYPT_BLK(v2)
+	eor	v3.16b, v3.16b, v2.16b
+	SM4_CRYPT_BLK(v3)

-	cbnz	w4, .Lcbc_enc_loop;
+	st1	{v0.16b-v3.16b}, [x1], #64
+	mov	RIV.16b, v3.16b

+	cbz	w4, .Lcbc_enc_end
+	b	.Lcbc_enc_loop_4x
+
+.Lcbc_enc_loop_1x:
+	sub	w4, w4, #1
+
+	ld1	{v0.16b}, [x2], #16
+
+	eor	RIV.16b, RIV.16b, v0.16b
+	SM4_CRYPT_BLK(RIV)
+
+	st1	{RIV.16b}, [x1], #16
+
+	cbnz	w4, .Lcbc_enc_loop_1x
+
+.Lcbc_enc_end:
	/* store new IV */
-	st1	{RIV.16b}, [x3];
+	st1	{RIV.16b}, [x3]

-	ret;
+	ret
 SYM_FUNC_END(sm4_ce_cbc_enc)

 .align 3
···
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

-	ld1	{RIV.16b}, [x3];
+	ld1	{RIV.16b}, [x3]

-.Lcbc_loop_blk:
-	sub	w4, w4, #8;
-	tbnz	w4, #31, .Lcbc_tail8;
+.Lcbc_dec_loop_8x:
+	sub	w4, w4, #8
+	tbnz	w4, #31, .Lcbc_dec_4x

-	ld1	{v0.16b-v3.16b}, [x2], #64;
-	ld1	{v4.16b-v7.16b}, [x2];
+	ld1	{v0.16b-v3.16b}, [x2], #64
+	ld1	{v4.16b-v7.16b}, [x2], #64

-	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+	rev32	v8.16b, v0.16b
+	rev32	v9.16b, v1.16b
+	rev32	v10.16b, v2.16b
+	rev32	v11.16b, v3.16b
+	rev32	v12.16b, v4.16b
+	rev32	v13.16b, v5.16b
+	rev32	v14.16b, v6.16b
+	rev32	v15.16b, v7.16b

-	sub	x2, x2, #64;
-	eor	v0.16b, v0.16b, RIV.16b;
-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v1.16b, v1.16b, RTMP0.16b;
-	eor	v2.16b, v2.16b, RTMP1.16b;
-	eor	v3.16b, v3.16b, RTMP2.16b;
-	st1	{v0.16b-v3.16b}, [x1], #64;
+	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

-	eor	v4.16b, v4.16b, RTMP3.16b;
-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v5.16b, v5.16b, RTMP0.16b;
-	eor	v6.16b, v6.16b, RTMP1.16b;
-	eor	v7.16b, v7.16b, RTMP2.16b;
+	eor	v8.16b, v8.16b, RIV.16b
+	eor	v9.16b, v9.16b, v0.16b
+	eor	v10.16b, v10.16b, v1.16b
+	eor	v11.16b, v11.16b, v2.16b
+	eor	v12.16b, v12.16b, v3.16b
+	eor	v13.16b, v13.16b, v4.16b
+	eor	v14.16b, v14.16b, v5.16b
+	eor	v15.16b, v15.16b, v6.16b

-	mov	RIV.16b, RTMP3.16b;
-	st1	{v4.16b-v7.16b}, [x1], #64;
+	st1	{v8.16b-v11.16b}, [x1], #64
+	st1	{v12.16b-v15.16b}, [x1], #64

-	cbz	w4, .Lcbc_end;
-	b	.Lcbc_loop_blk;
+	mov	RIV.16b, v7.16b

-.Lcbc_tail8:
-	add	w4, w4, #8;
-	cmp	w4, #4;
-	blt	.Lcbc_tail4;
+	cbz	w4, .Lcbc_dec_end
+	b	.Lcbc_dec_loop_8x

-	sub	w4, w4, #4;
+.Lcbc_dec_4x:
+	add	w4, w4, #8
+	cmp	w4, #4
+	blt	.Lcbc_dec_loop_1x

-	ld1	{v0.16b-v3.16b}, [x2];
+	sub	w4, w4, #4

-	SM4_CRYPT_BLK4(v0, v1, v2, v3);
+	ld1	{v0.16b-v3.16b}, [x2], #64

-	eor	v0.16b, v0.16b, RIV.16b;
-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v1.16b, v1.16b, RTMP0.16b;
-	eor	v2.16b, v2.16b, RTMP1.16b;
-	eor	v3.16b, v3.16b, RTMP2.16b;
+	rev32	v8.16b, v0.16b
+	rev32	v9.16b, v1.16b
+	rev32	v10.16b, v2.16b
+	rev32	v11.16b, v3.16b

-	mov	RIV.16b, RTMP3.16b;
-	st1	{v0.16b-v3.16b}, [x1], #64;
+	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

-	cbz	w4, .Lcbc_end;
+	eor	v8.16b, v8.16b, RIV.16b
+	eor	v9.16b, v9.16b, v0.16b
+	eor	v10.16b, v10.16b, v1.16b
+	eor	v11.16b, v11.16b, v2.16b

-.Lcbc_tail4:
-	sub	w4, w4, #1;
+	st1	{v8.16b-v11.16b}, [x1], #64

-	ld1	{v0.16b}, [x2];
+	mov	RIV.16b, v3.16b

-	SM4_CRYPT_BLK(v0);
+	cbz	w4, .Lcbc_dec_end

-	eor	v0.16b, v0.16b, RIV.16b;
-	ld1	{RIV.16b}, [x2], #16;
-	st1	{v0.16b}, [x1], #16;
+.Lcbc_dec_loop_1x:
+	sub	w4, w4, #1

-	cbnz	w4, .Lcbc_tail4;
+	ld1	{v0.16b}, [x2], #16

-.Lcbc_end:
+	rev32	v8.16b, v0.16b
+
+	SM4_CRYPT_BLK_BE(v8)
+
+	eor	v8.16b, v8.16b, RIV.16b
+	st1	{v8.16b}, [x1], #16
+
+	mov	RIV.16b, v0.16b
+
+	cbnz	w4, .Lcbc_dec_loop_1x
+
+.Lcbc_dec_end:
	/* store new IV */
-	st1	{RIV.16b}, [x3];
+	st1	{RIV.16b}, [x3]

-	ret;
+	ret
 SYM_FUNC_END(sm4_ce_cbc_dec)

 .align 3
···
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

-	ld1	{RIV.16b}, [x3];
+	ld1	{RIV.16b}, [x3]

-.Lcfb_enc_loop:
-	sub	w4, w4, #1;
+.Lcfb_enc_loop_4x:
+	cmp	w4, #4
+	blt	.Lcfb_enc_loop_1x

-	SM4_CRYPT_BLK(RIV);
+	sub	w4, w4, #4

-	ld1	{RTMP0.16b}, [x2], #16;
-	eor	RIV.16b, RIV.16b, RTMP0.16b;
-	st1	{RIV.16b}, [x1], #16;
+	ld1	{v0.16b-v3.16b}, [x2], #64

-	cbnz	w4, .Lcfb_enc_loop;
+	rev32	v8.16b, RIV.16b
+	SM4_CRYPT_BLK_BE(v8)
+	eor	v0.16b, v0.16b, v8.16b

+	rev32	v8.16b, v0.16b
+	SM4_CRYPT_BLK_BE(v8)
+	eor	v1.16b, v1.16b, v8.16b
+
+	rev32	v8.16b, v1.16b
+	SM4_CRYPT_BLK_BE(v8)
+	eor	v2.16b, v2.16b, v8.16b
+
+	rev32	v8.16b, v2.16b
+	SM4_CRYPT_BLK_BE(v8)
+	eor	v3.16b, v3.16b, v8.16b
+
+	st1	{v0.16b-v3.16b}, [x1], #64
+	mov	RIV.16b, v3.16b
+
+	cbz	w4, .Lcfb_enc_end
+	b	.Lcfb_enc_loop_4x
+
+.Lcfb_enc_loop_1x:
+	sub	w4, w4, #1
+
+	ld1	{v0.16b}, [x2], #16
+
+	SM4_CRYPT_BLK(RIV)
+	eor	RIV.16b, RIV.16b, v0.16b
+
+	st1	{RIV.16b}, [x1], #16
+
+	cbnz	w4, .Lcfb_enc_loop_1x
+
+.Lcfb_enc_end:
	/* store new IV */
-	st1	{RIV.16b}, [x3];
+	st1	{RIV.16b}, [x3]

-	ret;
+	ret
 SYM_FUNC_END(sm4_ce_cfb_enc)

 .align 3
···
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

-	ld1	{v0.16b}, [x3];
+	ld1	{RIV.16b}, [x3]

-.Lcfb_loop_blk:
-	sub	w4, w4, #8;
-	tbnz	w4, #31, .Lcfb_tail8;
+.Lcfb_dec_loop_8x:
+	sub	w4, w4, #8
+	tbnz	w4, #31, .Lcfb_dec_4x

-	ld1	{v1.16b, v2.16b, v3.16b}, [x2], #48;
-	ld1	{v4.16b-v7.16b}, [x2];
+	ld1	{v0.16b-v3.16b}, [x2], #64
+	ld1	{v4.16b-v7.16b}, [x2], #64

-	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
+	rev32	v8.16b, RIV.16b
+	rev32	v9.16b, v0.16b
+	rev32	v10.16b, v1.16b
+	rev32	v11.16b, v2.16b
+	rev32	v12.16b, v3.16b
+	rev32	v13.16b, v4.16b
+	rev32	v14.16b, v5.16b
+	rev32	v15.16b, v6.16b

-	sub	x2, x2, #48;
-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v0.16b, v0.16b, RTMP0.16b;
-	eor	v1.16b, v1.16b, RTMP1.16b;
-	eor	v2.16b, v2.16b, RTMP2.16b;
-	eor	v3.16b, v3.16b, RTMP3.16b;
-	st1	{v0.16b-v3.16b}, [x1], #64;
+	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v4.16b, v4.16b, RTMP0.16b;
-	eor	v5.16b, v5.16b, RTMP1.16b;
-	eor	v6.16b, v6.16b, RTMP2.16b;
-	eor	v7.16b, v7.16b, RTMP3.16b;
-	st1	{v4.16b-v7.16b}, [x1], #64;
+	mov	RIV.16b, v7.16b

-	mov	v0.16b, RTMP3.16b;
+	eor	v0.16b, v0.16b, v8.16b
+	eor	v1.16b, v1.16b, v9.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v7.16b, v7.16b, v15.16b

-	cbz	w4, .Lcfb_end;
-	b	.Lcfb_loop_blk;
+	st1	{v0.16b-v3.16b}, [x1], #64
+	st1	{v4.16b-v7.16b}, [x1], #64

-.Lcfb_tail8:
-	add	w4, w4, #8;
-	cmp	w4, #4;
-	blt	.Lcfb_tail4;
+	cbz	w4, .Lcfb_dec_end
+	b	.Lcfb_dec_loop_8x

-	sub	w4, w4, #4;
+.Lcfb_dec_4x:
+	add	w4, w4, #8
+	cmp	w4, #4
+	blt	.Lcfb_dec_loop_1x

-	ld1	{v1.16b, v2.16b, v3.16b}, [x2];
+	sub	w4, w4, #4

-	SM4_CRYPT_BLK4(v0, v1, v2, v3);
+	ld1	{v0.16b-v3.16b}, [x2], #64

-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v0.16b, v0.16b, RTMP0.16b;
-	eor	v1.16b, v1.16b, RTMP1.16b;
-	eor	v2.16b, v2.16b, RTMP2.16b;
-	eor	v3.16b, v3.16b, RTMP3.16b;
-	st1	{v0.16b-v3.16b}, [x1], #64;
+	rev32	v8.16b, RIV.16b
+	rev32	v9.16b, v0.16b
+	rev32	v10.16b, v1.16b
+	rev32	v11.16b, v2.16b

-	mov	v0.16b, RTMP3.16b;
+	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

-	cbz	w4, .Lcfb_end;
+	mov	RIV.16b, v3.16b

-.Lcfb_tail4:
-	sub	w4, w4, #1;
+	eor	v0.16b, v0.16b, v8.16b
+	eor	v1.16b, v1.16b, v9.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b

-	SM4_CRYPT_BLK(v0);
+	st1	{v0.16b-v3.16b}, [x1], #64

-	ld1	{RTMP0.16b}, [x2], #16;
-	eor	v0.16b, v0.16b, RTMP0.16b;
-	st1	{v0.16b}, [x1], #16;
+	cbz	w4, .Lcfb_dec_end

-	mov	v0.16b, RTMP0.16b;
+.Lcfb_dec_loop_1x:
+	sub	w4, w4, #1

-	cbnz	w4, .Lcfb_tail4;
+	ld1	{v0.16b}, [x2], #16

-.Lcfb_end:
+	SM4_CRYPT_BLK(RIV)
+
+	eor	RIV.16b, RIV.16b, v0.16b
+	st1	{RIV.16b}, [x1], #16
+
+	mov	RIV.16b, v0.16b
+
+	cbnz	w4, .Lcfb_dec_loop_1x
+
+.Lcfb_dec_end:
	/* store new IV */
-	st1	{v0.16b}, [x3];
+	st1	{RIV.16b}, [x3]

-	ret;
+	ret
 SYM_FUNC_END(sm4_ce_cfb_dec)

 .align 3
···
	 * x3: ctr (big endian, 128 bit)
	 * w4: nblocks
	 */
-	PREPARE;
+	SM4_PREPARE(x0)

-	ldp	x7, x8, [x3];
-	rev	x7, x7;
-	rev	x8, x8;
+	ldp	x7, x8, [x3]
+	rev	x7, x7
+	rev	x8, x8

-.Lctr_loop_blk:
-	sub	w4, w4, #8;
-	tbnz	w4, #31, .Lctr_tail8;
+.Lctr_loop_8x:
+	sub	w4, w4, #8
+	tbnz	w4, #31, .Lctr_4x

-#define inc_le128(vctr) \
-	mov	vctr.d[1], x8; \
-	mov	vctr.d[0], x7; \
-	adds	x8, x8, #1; \
-	adc	x7, x7, xzr; \
-	rev64	vctr.16b, vctr.16b;
-
-	/* construct CTRs */
-	inc_le128(v0);			/* +0 */
-	inc_le128(v1);			/* +1 */
-	inc_le128(v2);			/* +2 */
-	inc_le128(v3);			/* +3 */
-	inc_le128(v4);			/* +4 */
-	inc_le128(v5);			/* +5 */
-	inc_le128(v6);			/* +6 */
-	inc_le128(v7);			/* +7 */
-
-	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);
-
-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v0.16b, v0.16b, RTMP0.16b;
-	eor	v1.16b, v1.16b, RTMP1.16b;
-	eor	v2.16b, v2.16b, RTMP2.16b;
-	eor	v3.16b, v3.16b, RTMP3.16b;
-	st1	{v0.16b-v3.16b}, [x1], #64;
-
-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v4.16b, v4.16b, RTMP0.16b;
-	eor	v5.16b, v5.16b, RTMP1.16b;
-	eor	v6.16b, v6.16b, RTMP2.16b;
-	eor	v7.16b, v7.16b, RTMP3.16b;
-	st1	{v4.16b-v7.16b}, [x1], #64;
-
-	cbz	w4, .Lctr_end;
-	b	.Lctr_loop_blk;
-
-.Lctr_tail8:
-	add	w4, w4, #8;
-	cmp	w4, #4;
-	blt	.Lctr_tail4;
-
-	sub	w4, w4, #4;
+#define inc_le128(vctr) \
+	mov	vctr.d[1], x8; \
+	mov	vctr.d[0], x7; \
+	adds	x8, x8, #1; \
+	rev64	vctr.16b, vctr.16b; \
+	adc	x7, x7, xzr;

	/* construct CTRs */
-	inc_le128(v0);			/* +0 */
-	inc_le128(v1);			/* +1 */
-	inc_le128(v2);			/* +2 */
-	inc_le128(v3);			/* +3 */
+	inc_le128(v0)			/* +0 */
+	inc_le128(v1)			/* +1 */
+	inc_le128(v2)			/* +2 */
+	inc_le128(v3)			/* +3 */
+	inc_le128(v4)			/* +4 */
+	inc_le128(v5)			/* +5 */
+	inc_le128(v6)			/* +6 */
+	inc_le128(v7)			/* +7 */

-	SM4_CRYPT_BLK4(v0, v1, v2, v3);
+	ld1	{v8.16b-v11.16b}, [x2], #64
+	ld1	{v12.16b-v15.16b}, [x2], #64

-	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64;
-	eor	v0.16b, v0.16b, RTMP0.16b;
-	eor	v1.16b, v1.16b, RTMP1.16b;
-	eor	v2.16b, v2.16b, RTMP2.16b;
-	eor	v3.16b, v3.16b, RTMP3.16b;
-	st1	{v0.16b-v3.16b}, [x1], #64;
+	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

-	cbz	w4, .Lctr_end;
+	eor	v0.16b, v0.16b, v8.16b
+	eor	v1.16b, v1.16b, v9.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+	eor	v4.16b, v4.16b, v12.16b
+	eor	v5.16b, v5.16b, v13.16b
+	eor	v6.16b, v6.16b, v14.16b
+	eor	v7.16b, v7.16b, v15.16b

-.Lctr_tail4:
-	sub	w4, w4, #1;
+	st1	{v0.16b-v3.16b}, [x1], #64
+	st1	{v4.16b-v7.16b}, [x1], #64
+
+	cbz	w4, .Lctr_end
+	b	.Lctr_loop_8x
+
+.Lctr_4x:
+	add	w4, w4, #8
+	cmp	w4, #4
+	blt	.Lctr_loop_1x
+
+	sub	w4, w4, #4

	/* construct CTRs */
-	inc_le128(v0);
+	inc_le128(v0)			/* +0 */
+	inc_le128(v1)			/* +1 */
+	inc_le128(v2)			/* +2 */
+	inc_le128(v3)			/* +3 */

-	SM4_CRYPT_BLK(v0);
+	ld1	{v8.16b-v11.16b}, [x2], #64

-	ld1	{RTMP0.16b}, [x2], #16;
-	eor	v0.16b, v0.16b, RTMP0.16b;
-	st1	{v0.16b}, [x1], #16;
+	SM4_CRYPT_BLK4(v0, v1, v2, v3)

-	cbnz	w4, .Lctr_tail4;
+	eor	v0.16b, v0.16b, v8.16b
+	eor	v1.16b, v1.16b, v9.16b
+	eor	v2.16b, v2.16b, v10.16b
+	eor	v3.16b, v3.16b, v11.16b
+
+	st1	{v0.16b-v3.16b}, [x1], #64
+
+	cbz	w4, .Lctr_end
+
+.Lctr_loop_1x:
+	sub	w4, w4, #1
+
+	/* construct CTRs */
+	inc_le128(v0)
+
+	ld1	{v8.16b}, [x2], #16
+
+	SM4_CRYPT_BLK(v0)
+
+	eor	v0.16b, v0.16b, v8.16b
+	st1	{v0.16b}, [x1], #16
+
+	cbnz	w4, .Lctr_loop_1x

 .Lctr_end:
	/* store new CTR */
-	rev	x7, x7;
-	rev	x8, x8;
-	stp	x7, x8, [x3];
+	rev	x7, x7
+	rev	x8, x8
+	stp	x7, x8, [x3]

-	ret;
+	ret
 SYM_FUNC_END(sm4_ce_ctr_enc)
arch/arm64/crypto/sm4-ce-glue.c (+28, -38)
···
 asmlinkage void sm4_ce_crypt(const u32 *rkey, u8 *dst, const u8 *src,
			      unsigned int nblks);
 asmlinkage void sm4_ce_cbc_enc(const u32 *rkey, u8 *dst, const u8 *src,
-			       u8 *iv, unsigned int nblks);
+			       u8 *iv, unsigned int nblocks);
 asmlinkage void sm4_ce_cbc_dec(const u32 *rkey, u8 *dst, const u8 *src,
-			       u8 *iv, unsigned int nblks);
+			       u8 *iv, unsigned int nblocks);
 asmlinkage void sm4_ce_cfb_enc(const u32 *rkey, u8 *dst, const u8 *src,
			        u8 *iv, unsigned int nblks);
 asmlinkage void sm4_ce_cfb_dec(const u32 *rkey, u8 *dst, const u8 *src,
···
	return sm4_ecb_do_crypt(req, ctx->rkey_dec);
 }

-static int sm4_cbc_encrypt(struct skcipher_request *req)
+static int sm4_cbc_crypt(struct skcipher_request *req,
+			 struct sm4_ctx *ctx, bool encrypt)
 {
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	unsigned int nbytes;
	int err;

	err = skcipher_walk_virt(&walk, req, false);
+	if (err)
+		return err;

	while ((nbytes = walk.nbytes) > 0) {
		const u8 *src = walk.src.virt.addr;
		u8 *dst = walk.dst.virt.addr;
-		unsigned int nblks;
+		unsigned int nblocks;

-		kernel_neon_begin();
+		nblocks = nbytes / SM4_BLOCK_SIZE;
+		if (nblocks) {
+			kernel_neon_begin();

-		nblks = BYTES2BLKS(nbytes);
-		if (nblks) {
-			sm4_ce_cbc_enc(ctx->rkey_enc, dst, src, walk.iv, nblks);
-			nbytes -= nblks * SM4_BLOCK_SIZE;
+			if (encrypt)
+				sm4_ce_cbc_enc(ctx->rkey_enc, dst, src,
+					       walk.iv, nblocks);
+			else
+				sm4_ce_cbc_dec(ctx->rkey_dec, dst, src,
+					       walk.iv, nblocks);
+
+			kernel_neon_end();
		}

-		kernel_neon_end();
-
-		err = skcipher_walk_done(&walk, nbytes);
+		err = skcipher_walk_done(&walk, nbytes % SM4_BLOCK_SIZE);
	}

	return err;
+}
+
+static int sm4_cbc_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	return sm4_cbc_crypt(req, ctx, true);
 }

 static int sm4_cbc_decrypt(struct skcipher_request *req)
 {
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct sm4_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	unsigned int nbytes;
-	int err;

-	err = skcipher_walk_virt(&walk, req, false);
-
-	while ((nbytes = walk.nbytes) > 0) {
-		const u8 *src = walk.src.virt.addr;
-		u8 *dst = walk.dst.virt.addr;
-		unsigned int nblks;
-
-		kernel_neon_begin();
-
-		nblks = BYTES2BLKS(nbytes);
-		if (nblks) {
-			sm4_ce_cbc_dec(ctx->rkey_dec, dst, src, walk.iv, nblks);
-			nbytes -= nblks * SM4_BLOCK_SIZE;
-		}
-
-		kernel_neon_end();
-
-		err = skcipher_walk_done(&walk, nbytes);
-	}
-
-	return err;
+	return sm4_cbc_crypt(req, ctx, false);
 }

 static int sm4_cfb_encrypt(struct skcipher_request *req)