
crypto: x86/curve25519 - use in/out register constraints more precisely

Rather than passing all variables to the asm blocks as read-write
outputs, pass the ones that are only read as input operands. This helps
with old gcc versions when alternatives are additionally used, lets
gcc's codegen be a little more efficient, and syncs up with the latest
Vale/EverCrypt output.
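
As a standalone illustration of the constraint change (a hypothetical
sum2() helper, not code from this patch): an operand that the asm only
reads belongs in the input list, rather than in the output list as a
"+&r" read-write operand, so gcc knows the register still holds its
value afterwards and may reuse it:

    static inline unsigned long sum2(unsigned long a, unsigned long b)
    {
            unsigned long ret = a;

            /* 'ret' really is read-write, so it stays "+&r";
             * 'b' is only ever read, so it goes in the input list. */
            asm("add %1, %0"
                : "+&r" (ret)
                : "r" (b)
                : "cc");
            return ret;
    }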

Reported-by: Mathias Krause <minipli@grsecurity.net>
Cc: Aymeric Fromherz <aymeric.fromherz@inria.fr>
Link: https://lore.kernel.org/wireguard/1554725710.1290070.1639240504281.JavaMail.zimbra@inria.fr/
Link: https://github.com/project-everest/hacl-star/pull/501
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Mathias Krause <minipli@grsecurity.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by Jason A. Donenfeld, committed by Herbert Xu
acd93f8a 38e9791a

+489 -278
arch/x86/crypto/curve25519-x86_64.c
···
 
 		/* Return the carry bit in a register */
 		" adcx %%r11, %1;"
-		: "+&r" (f2), "=&r" (carry_r)
-		: "r" (out), "r" (f1)
-		: "%r8", "%r9", "%r10", "%r11", "memory", "cc"
-	);
+		: "+&r"(f2), "=&r"(carry_r)
+		: "r"(out), "r"(f1)
+		: "%r8", "%r9", "%r10", "%r11", "memory", "cc");
 
 	return carry_r;
 }
···
 		" cmovc %0, %%rax;"
 		" add %%rax, %%r8;"
 		" movq %%r8, 0(%1);"
-		: "+&r" (f2)
-		: "r" (out), "r" (f1)
-		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
-	);
+		: "+&r"(f2)
+		: "r"(out), "r"(f1)
+		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
 }
 
 /* Computes the field subtraction of two field elements */
···
 		" movq %%r9, 8(%0);"
 		" movq %%r10, 16(%0);"
 		" movq %%r11, 24(%0);"
-		:
-		: "r" (out), "r" (f1), "r" (f2)
-		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
-	);
+		:
+		: "r"(out), "r"(f1), "r"(f2)
+		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc");
 }
 
 /* Computes a field multiplication: out <- f1 * f2
···
 static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 {
 	asm volatile(
+
 		/* Compute the raw multiplication: tmp <- src1 * src2 */
 
 		/* Compute src1[0] * src2 */
-		" movq 0(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;"
+		" movq 0(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" movq %%r8, 0(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" movq %%r10, 8(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+
 		/* Compute src1[1] * src2 */
-		" movq 8(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+		" movq 8(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 8(%2), %%r8;"
+		" movq %%r8, 8(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 16(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" mov $0, %%r8;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+
 		/* Compute src1[2] * src2 */
-		" movq 16(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+		" movq 16(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 16(%2), %%r8;"
+		" movq %%r8, 16(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 24(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" mov $0, %%r8;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+
 		/* Compute src1[3] * src2 */
-		" movq 24(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+		" movq 24(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 24(%2), %%r8;"
+		" movq %%r8, 24(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 32(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" movq %%rbx, 40(%2);"
+		" mov $0, %%r8;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" movq %%r14, 48(%2);"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+		" movq %%rax, 56(%2);"
+
 		/* Line up pointers */
-		" mov %0, %1;"
 		" mov %2, %0;"
+		" mov %3, %2;"
 
 		/* Wrap the result back into the field */
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
-		" mulxq 32(%1), %%r8, %%r13;"
-		" xor %k3, %k3;"
-		" adoxq 0(%1), %%r8;"
-		" mulxq 40(%1), %%r9, %%rbx;"
+		" mulxq 32(%0), %%r8, %%r13;"
+		" xor %k1, %k1;"
+		" adoxq 0(%0), %%r8;"
+		" mulxq 40(%0), %%r9, %%rbx;"
 		" adcx %%r13, %%r9;"
-		" adoxq 8(%1), %%r9;"
-		" mulxq 48(%1), %%r10, %%r13;"
+		" adoxq 8(%0), %%r9;"
+		" mulxq 48(%0), %%r10, %%r13;"
 		" adcx %%rbx, %%r10;"
-		" adoxq 16(%1), %%r10;"
-		" mulxq 56(%1), %%r11, %%rax;"
+		" adoxq 16(%0), %%r10;"
+		" mulxq 56(%0), %%r11, %%rax;"
 		" adcx %%r13, %%r11;"
-		" adoxq 24(%1), %%r11;"
-		" adcx %3, %%rax;"
-		" adox %3, %%rax;"
+		" adoxq 24(%0), %%r11;"
+		" adcx %1, %%rax;"
+		" adox %1, %%rax;"
 		" imul %%rdx, %%rax;"
 
 		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
-		" adcx %3, %%r9;"
-		" movq %%r9, 8(%0);"
-		" adcx %3, %%r10;"
-		" movq %%r10, 16(%0);"
-		" adcx %3, %%r11;"
-		" movq %%r11, 24(%0);"
+		" adcx %1, %%r9;"
+		" movq %%r9, 8(%2);"
+		" adcx %1, %%r10;"
+		" movq %%r10, 16(%2);"
+		" adcx %1, %%r11;"
+		" movq %%r11, 24(%2);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
-		" movq %%r8, 0(%0);"
-		: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
-		:
-		: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
-	);
+		" movq %%r8, 0(%2);"
+		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "%r14", "memory", "cc");
 }
 
 /* Computes two field multiplications:
- * out[0] <- f1[0] * f2[0]
- * out[1] <- f1[1] * f2[1]
- * Uses the 16-element buffer tmp for intermediate results. */
+ *   out[0] <- f1[0] * f2[0]
+ *   out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results: */
 static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 {
 	asm volatile(
+
 		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
 
 		/* Compute src1[0] * src2 */
-		" movq 0(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 0(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;"
+		" movq 0(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" movq %%r8, 0(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" movq %%r10, 8(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+
 		/* Compute src1[1] * src2 */
-		" movq 8(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 16(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+		" movq 8(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 8(%2), %%r8;"
+		" movq %%r8, 8(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 16(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" mov $0, %%r8;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+
 		/* Compute src1[2] * src2 */
-		" movq 16(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 24(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+		" movq 16(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 16(%2), %%r8;"
+		" movq %%r8, 16(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 24(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" mov $0, %%r8;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+
 		/* Compute src1[3] * src2 */
-		" movq 24(%1), %%rdx;"
-		" mulxq 0(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
-		" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 32(%0);"
-		" mulxq 16(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 40(%0);" " mov $0, %%r8;"
-		" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+		" movq 24(%0), %%rdx;"
+		" mulxq 0(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 24(%2), %%r8;"
+		" movq %%r8, 24(%2);"
+		" mulxq 8(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 32(%2);"
+		" mulxq 16(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" movq %%rbx, 40(%2);"
+		" mov $0, %%r8;"
+		" mulxq 24(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" movq %%r14, 48(%2);"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+		" movq %%rax, 56(%2);"
 
 		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
 
 		/* Compute src1[0] * src2 */
-		" movq 32(%1), %%rdx;"
-		" mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " movq %%r8, 64(%0);"
-		" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
-		" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;"
-		" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;"
+		" movq 32(%0), %%rdx;"
+		" mulxq 32(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" movq %%r8, 64(%2);"
+		" mulxq 40(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" movq %%r10, 72(%2);"
+		" mulxq 48(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" mulxq 56(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+
 		/* Compute src1[1] * src2 */
-		" movq 40(%1), %%rdx;"
-		" mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
-		" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);"
-		" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
-		" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+		" movq 40(%0), %%rdx;"
+		" mulxq 32(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 72(%2), %%r8;"
+		" movq %%r8, 72(%2);"
+		" mulxq 40(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 80(%2);"
+		" mulxq 48(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" mov $0, %%r8;"
+		" mulxq 56(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+
 		/* Compute src1[2] * src2 */
-		" movq 48(%1), %%rdx;"
-		" mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
-		" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);"
-		" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;"
-		" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+		" movq 48(%0), %%rdx;"
+		" mulxq 32(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 80(%2), %%r8;"
+		" movq %%r8, 80(%2);"
+		" mulxq 40(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 88(%2);"
+		" mulxq 48(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" mov $0, %%r8;"
+		" mulxq 56(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+
 		/* Compute src1[3] * src2 */
-		" movq 56(%1), %%rdx;"
-		" mulxq 32(%3), %%r8, %%r9;" " xor %%r10d, %%r10d;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
-		" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);"
-		" mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;"
-		" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
-		" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
+		" movq 56(%0), %%rdx;"
+		" mulxq 32(%1), %%r8, %%r9;"
+		" xor %%r10d, %%r10d;"
+		" adcxq 88(%2), %%r8;"
+		" movq %%r8, 88(%2);"
+		" mulxq 40(%1), %%r10, %%r11;"
+		" adox %%r9, %%r10;"
+		" adcx %%rbx, %%r10;"
+		" movq %%r10, 96(%2);"
+		" mulxq 48(%1), %%rbx, %%r13;"
+		" adox %%r11, %%rbx;"
+		" adcx %%r14, %%rbx;"
+		" movq %%rbx, 104(%2);"
+		" mov $0, %%r8;"
+		" mulxq 56(%1), %%r14, %%rdx;"
+		" adox %%r13, %%r14;"
+		" adcx %%rax, %%r14;"
+		" movq %%r14, 112(%2);"
+		" mov $0, %%rax;"
+		" adox %%rdx, %%rax;"
+		" adcx %%r8, %%rax;"
+		" movq %%rax, 120(%2);"
+
 		/* Line up pointers */
-		" mov %0, %1;"
 		" mov %2, %0;"
+		" mov %3, %2;"
 
 		/* Wrap the results back into the field */
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
-		" mulxq 32(%1), %%r8, %%r13;"
-		" xor %k3, %k3;"
-		" adoxq 0(%1), %%r8;"
-		" mulxq 40(%1), %%r9, %%rbx;"
+		" mulxq 32(%0), %%r8, %%r13;"
+		" xor %k1, %k1;"
+		" adoxq 0(%0), %%r8;"
+		" mulxq 40(%0), %%r9, %%rbx;"
 		" adcx %%r13, %%r9;"
-		" adoxq 8(%1), %%r9;"
-		" mulxq 48(%1), %%r10, %%r13;"
+		" adoxq 8(%0), %%r9;"
+		" mulxq 48(%0), %%r10, %%r13;"
 		" adcx %%rbx, %%r10;"
-		" adoxq 16(%1), %%r10;"
-		" mulxq 56(%1), %%r11, %%rax;"
+		" adoxq 16(%0), %%r10;"
+		" mulxq 56(%0), %%r11, %%rax;"
 		" adcx %%r13, %%r11;"
-		" adoxq 24(%1), %%r11;"
-		" adcx %3, %%rax;"
-		" adox %3, %%rax;"
+		" adoxq 24(%0), %%r11;"
+		" adcx %1, %%rax;"
+		" adox %1, %%rax;"
 		" imul %%rdx, %%rax;"
 
 		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
-		" adcx %3, %%r9;"
-		" movq %%r9, 8(%0);"
-		" adcx %3, %%r10;"
-		" movq %%r10, 16(%0);"
-		" adcx %3, %%r11;"
-		" movq %%r11, 24(%0);"
+		" adcx %1, %%r9;"
+		" movq %%r9, 8(%2);"
+		" adcx %1, %%r10;"
+		" movq %%r10, 16(%2);"
+		" adcx %1, %%r11;"
+		" movq %%r11, 24(%2);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
-		" movq %%r8, 0(%0);"
+		" movq %%r8, 0(%2);"
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
-		" mulxq 96(%1), %%r8, %%r13;"
-		" xor %k3, %k3;"
-		" adoxq 64(%1), %%r8;"
-		" mulxq 104(%1), %%r9, %%rbx;"
+		" mulxq 96(%0), %%r8, %%r13;"
+		" xor %k1, %k1;"
+		" adoxq 64(%0), %%r8;"
+		" mulxq 104(%0), %%r9, %%rbx;"
 		" adcx %%r13, %%r9;"
-		" adoxq 72(%1), %%r9;"
-		" mulxq 112(%1), %%r10, %%r13;"
+		" adoxq 72(%0), %%r9;"
+		" mulxq 112(%0), %%r10, %%r13;"
 		" adcx %%rbx, %%r10;"
-		" adoxq 80(%1), %%r10;"
-		" mulxq 120(%1), %%r11, %%rax;"
+		" adoxq 80(%0), %%r10;"
+		" mulxq 120(%0), %%r11, %%rax;"
 		" adcx %%r13, %%r11;"
-		" adoxq 88(%1), %%r11;"
-		" adcx %3, %%rax;"
-		" adox %3, %%rax;"
+		" adoxq 88(%0), %%r11;"
+		" adcx %1, %%rax;"
+		" adox %1, %%rax;"
 		" imul %%rdx, %%rax;"
 
 		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
-		" adcx %3, %%r9;"
-		" movq %%r9, 40(%0);"
-		" adcx %3, %%r10;"
-		" movq %%r10, 48(%0);"
-		" adcx %3, %%r11;"
-		" movq %%r11, 56(%0);"
+		" adcx %1, %%r9;"
+		" movq %%r9, 40(%2);"
+		" adcx %1, %%r10;"
+		" movq %%r10, 48(%2);"
+		" adcx %1, %%r11;"
+		" movq %%r11, 56(%2);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
-		" movq %%r8, 32(%0);"
-		: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
-		:
-		: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "memory", "cc"
-	);
+		" movq %%r8, 32(%2);"
+		: "+&r"(f1), "+&r"(f2), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "%r14", "memory", "cc");
 }
 
-/* Computes the field multiplication of four-element f1 with value in f2 */
+/* Computes the field multiplication of four-element f1 with value in f2
+ * Requires f2 to be smaller than 2^17 */
 static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 {
 	register u64 f2_r asm("rdx") = f2;
 
 	asm volatile(
 		/* Compute the raw multiplication of f1*f2 */
-		" mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
-		" mulxq 8(%2), %%r9, %%rbx;"      /* f1[1]*f2 */
+		" mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+		" mulxq 8(%2), %%r9, %%rbx;" /* f1[1]*f2 */
 		" add %%rcx, %%r9;"
 		" mov $0, %%rcx;"
-		" mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
+		" mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
 		" adcx %%rbx, %%r10;"
-		" mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
+		" mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
 		" adcx %%r13, %%r11;"
 		" adcx %%rcx, %%rax;"
···
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
 		" movq %%r8, 0(%1);"
-		: "+&r" (f2_r)
-		: "r" (out), "r" (f1)
-		: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "memory", "cc"
-	);
+		: "+&r"(f2_r)
+		: "r"(out), "r"(f1)
+		: "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r13",
+		  "memory", "cc");
 }
 
 /* Computes p1 <- bit ? p2 : p1 in constant time */
 static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
 {
 	asm volatile(
-		/* Invert the polarity of bit to match cmov expectations */
+		/* Transfer bit into CF flag */
 		" add $18446744073709551615, %0;"
 
 		/* cswap p1[0], p2[0] */
···
 		" cmovc %%r10, %%r9;"
 		" movq %%r8, 56(%1);"
 		" movq %%r9, 56(%2);"
-		: "+&r" (bit)
-		: "r" (p1), "r" (p2)
-		: "%r8", "%r9", "%r10", "memory", "cc"
-	);
+		: "+&r"(bit)
+		: "r"(p1), "r"(p2)
+		: "%r8", "%r9", "%r10", "memory", "cc");
}
 
 /* Computes the square of a field element: out <- f * f
···
 		/* Compute the raw multiplication: tmp <- f * f */
 
 		/* Step 1: Compute all partial products */
-		" movq 0(%1), %%rdx;" /* f[0] */
-		" mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
-		" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
-		" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
-		" movq 24(%1), %%rdx;" /* f[3] */
-		" mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
-		" mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
-		" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
-		" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+		" movq 0(%0), %%rdx;" /* f[0] */
+		" mulxq 8(%0), %%r8, %%r14;"
+		" xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		" mulxq 16(%0), %%r9, %%r10;"
+		" adcx %%r14, %%r9;" /* f[2]*f[0] */
+		" mulxq 24(%0), %%rax, %%rcx;"
+		" adcx %%rax, %%r10;" /* f[3]*f[0] */
+		" movq 24(%0), %%rdx;" /* f[3] */
+		" mulxq 8(%0), %%r11, %%rbx;"
+		" adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		" mulxq 16(%0), %%rax, %%r13;"
+		" adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		" movq 8(%0), %%rdx;"
+		" adcx %%r15, %%r13;" /* f1 */
+		" mulxq 16(%0), %%rax, %%rcx;"
+		" mov $0, %%r14;" /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15d, %%r15d;"
···
 		" adcx %%r14, %%r14;"
 
 		/* Step 3: Compute intermediate squares */
-		" movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
-		" movq %%rax, 0(%0);"
-		" add %%rcx, %%r8;" " movq %%r8, 8(%0);"
-		" movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
-		" adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
-		" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
-		" movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
-		" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
-		" adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
-		" movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
-		" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
-		" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+		" movq 0(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		" movq %%rax, 0(%1);"
+		" add %%rcx, %%r8;"
+		" movq %%r8, 8(%1);"
+		" movq 8(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		" adcx %%rax, %%r9;"
+		" movq %%r9, 16(%1);"
+		" adcx %%rcx, %%r10;"
+		" movq %%r10, 24(%1);"
+		" movq 16(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		" adcx %%rax, %%r11;"
+		" movq %%r11, 32(%1);"
+		" adcx %%rcx, %%rbx;"
+		" movq %%rbx, 40(%1);"
+		" movq 24(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		" adcx %%rax, %%r13;"
+		" movq %%r13, 48(%1);"
+		" adcx %%rcx, %%r14;"
+		" movq %%r14, 56(%1);"
 
 		/* Line up pointers */
-		" mov %0, %1;"
-		" mov %2, %0;"
+		" mov %1, %0;"
+		" mov %2, %1;"
 
 		/* Wrap the result back into the field */
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
-		" mulxq 32(%1), %%r8, %%r13;"
+		" mulxq 32(%0), %%r8, %%r13;"
 		" xor %%ecx, %%ecx;"
-		" adoxq 0(%1), %%r8;"
-		" mulxq 40(%1), %%r9, %%rbx;"
+		" adoxq 0(%0), %%r8;"
+		" mulxq 40(%0), %%r9, %%rbx;"
 		" adcx %%r13, %%r9;"
-		" adoxq 8(%1), %%r9;"
-		" mulxq 48(%1), %%r10, %%r13;"
+		" adoxq 8(%0), %%r9;"
+		" mulxq 48(%0), %%r10, %%r13;"
 		" adcx %%rbx, %%r10;"
-		" adoxq 16(%1), %%r10;"
-		" mulxq 56(%1), %%r11, %%rax;"
+		" adoxq 16(%0), %%r10;"
+		" mulxq 56(%0), %%r11, %%rax;"
 		" adcx %%r13, %%r11;"
-		" adoxq 24(%1), %%r11;"
+		" adoxq 24(%0), %%r11;"
 		" adcx %%rcx, %%rax;"
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
···
 		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
-		" movq %%r9, 8(%0);"
+		" movq %%r9, 8(%1);"
 		" adcx %%rcx, %%r10;"
-		" movq %%r10, 16(%0);"
+		" movq %%r10, 16(%1);"
 		" adcx %%rcx, %%r11;"
-		" movq %%r11, 24(%0);"
+		" movq %%r11, 24(%1);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
-		" movq %%r8, 0(%0);"
-		: "+&r" (tmp), "+&r" (f), "+&r" (out)
-		:
-		: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
-	);
+		" movq %%r8, 0(%1);"
+		: "+&r"(f), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+		  "%r13", "%r14", "%r15", "memory", "cc");
 }
 
 /* Computes two field squarings:
- * out[0] <- f[0] * f[0]
- * out[1] <- f[1] * f[1]
+ *   out[0] <- f[0] * f[0]
+ *   out[1] <- f[1] * f[1]
  * Uses the 16-element buffer tmp for intermediate results */
 static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 {
 	asm volatile(
 		/* Step 1: Compute all partial products */
-		" movq 0(%1), %%rdx;" /* f[0] */
-		" mulxq 8(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
-		" mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
-		" mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
-		" movq 24(%1), %%rdx;" /* f[3] */
-		" mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
-		" mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
-		" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
-		" mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+		" movq 0(%0), %%rdx;" /* f[0] */
+		" mulxq 8(%0), %%r8, %%r14;"
+		" xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		" mulxq 16(%0), %%r9, %%r10;"
+		" adcx %%r14, %%r9;" /* f[2]*f[0] */
+		" mulxq 24(%0), %%rax, %%rcx;"
+		" adcx %%rax, %%r10;" /* f[3]*f[0] */
+		" movq 24(%0), %%rdx;" /* f[3] */
+		" mulxq 8(%0), %%r11, %%rbx;"
+		" adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		" mulxq 16(%0), %%rax, %%r13;"
+		" adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		" movq 8(%0), %%rdx;"
+		" adcx %%r15, %%r13;" /* f1 */
+		" mulxq 16(%0), %%rax, %%rcx;"
+		" mov $0, %%r14;" /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15d, %%r15d;"
···
 		" adcx %%r14, %%r14;"
 
 		/* Step 3: Compute intermediate squares */
-		" movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
-		" movq %%rax, 0(%0);"
-		" add %%rcx, %%r8;" " movq %%r8, 8(%0);"
-		" movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
-		" adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
-		" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
-		" movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
-		" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
-		" adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);"
-		" movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
-		" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
-		" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+		" movq 0(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		" movq %%rax, 0(%1);"
+		" add %%rcx, %%r8;"
+		" movq %%r8, 8(%1);"
+		" movq 8(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		" adcx %%rax, %%r9;"
+		" movq %%r9, 16(%1);"
+		" adcx %%rcx, %%r10;"
+		" movq %%r10, 24(%1);"
+		" movq 16(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		" adcx %%rax, %%r11;"
+		" movq %%r11, 32(%1);"
+		" adcx %%rcx, %%rbx;"
+		" movq %%rbx, 40(%1);"
+		" movq 24(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		" adcx %%rax, %%r13;"
+		" movq %%r13, 48(%1);"
+		" adcx %%rcx, %%r14;"
+		" movq %%r14, 56(%1);"
 
 		/* Step 1: Compute all partial products */
-		" movq 32(%1), %%rdx;" /* f[0] */
-		" mulxq 40(%1), %%r8, %%r14;" " xor %%r15d, %%r15d;" /* f[1]*f[0] */
-		" mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
-		" mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
-		" movq 56(%1), %%rdx;" /* f[3] */
-		" mulxq 40(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
-		" mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */
-		" movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
-		" mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+		" movq 32(%0), %%rdx;" /* f[0] */
+		" mulxq 40(%0), %%r8, %%r14;"
+		" xor %%r15d, %%r15d;" /* f[1]*f[0] */
+		" mulxq 48(%0), %%r9, %%r10;"
+		" adcx %%r14, %%r9;" /* f[2]*f[0] */
+		" mulxq 56(%0), %%rax, %%rcx;"
+		" adcx %%rax, %%r10;" /* f[3]*f[0] */
+		" movq 56(%0), %%rdx;" /* f[3] */
+		" mulxq 40(%0), %%r11, %%rbx;"
+		" adcx %%rcx, %%r11;" /* f[1]*f[3] */
+		" mulxq 48(%0), %%rax, %%r13;"
+		" adcx %%rax, %%rbx;" /* f[2]*f[3] */
+		" movq 40(%0), %%rdx;"
+		" adcx %%r15, %%r13;" /* f1 */
+		" mulxq 48(%0), %%rax, %%rcx;"
+		" mov $0, %%r14;" /* f[2]*f[1] */
 
 		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15d, %%r15d;"
···
 		" adcx %%r14, %%r14;"
 
 		/* Step 3: Compute intermediate squares */
-		" movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
-		" movq %%rax, 64(%0);"
-		" add %%rcx, %%r8;" " movq %%r8, 72(%0);"
-		" movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
-		" adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
-		" adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
-		" movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
-		" adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
-		" adcx %%rcx, %%rbx;" " movq %%rbx, 104(%0);"
-		" movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
-		" adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
-		" adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
+		" movq 32(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+		" movq %%rax, 64(%1);"
+		" add %%rcx, %%r8;"
+		" movq %%r8, 72(%1);"
+		" movq 40(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+		" adcx %%rax, %%r9;"
+		" movq %%r9, 80(%1);"
+		" adcx %%rcx, %%r10;"
+		" movq %%r10, 88(%1);"
+		" movq 48(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+		" adcx %%rax, %%r11;"
+		" movq %%r11, 96(%1);"
+		" adcx %%rcx, %%rbx;"
+		" movq %%rbx, 104(%1);"
+		" movq 56(%0), %%rdx;"
+		" mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+		" adcx %%rax, %%r13;"
+		" movq %%r13, 112(%1);"
+		" adcx %%rcx, %%r14;"
+		" movq %%r14, 120(%1);"
 
 		/* Line up pointers */
-		" mov %0, %1;"
-		" mov %2, %0;"
+		" mov %1, %0;"
+		" mov %2, %1;"
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
-		" mulxq 32(%1), %%r8, %%r13;"
+		" mulxq 32(%0), %%r8, %%r13;"
 		" xor %%ecx, %%ecx;"
-		" adoxq 0(%1), %%r8;"
-		" mulxq 40(%1), %%r9, %%rbx;"
+		" adoxq 0(%0), %%r8;"
+		" mulxq 40(%0), %%r9, %%rbx;"
 		" adcx %%r13, %%r9;"
-		" adoxq 8(%1), %%r9;"
-		" mulxq 48(%1), %%r10, %%r13;"
+		" adoxq 8(%0), %%r9;"
+		" mulxq 48(%0), %%r10, %%r13;"
 		" adcx %%rbx, %%r10;"
-		" adoxq 16(%1), %%r10;"
-		" mulxq 56(%1), %%r11, %%rax;"
+		" adoxq 16(%0), %%r10;"
+		" mulxq 56(%0), %%r11, %%rax;"
 		" adcx %%r13, %%r11;"
-		" adoxq 24(%1), %%r11;"
+		" adoxq 24(%0), %%r11;"
 		" adcx %%rcx, %%rax;"
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
···
 		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
-		" movq %%r9, 8(%0);"
+		" movq %%r9, 8(%1);"
 		" adcx %%rcx, %%r10;"
-		" movq %%r10, 16(%0);"
+		" movq %%r10, 16(%1);"
 		" adcx %%rcx, %%r11;"
-		" movq %%r11, 24(%0);"
+		" movq %%r11, 24(%1);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
-		" movq %%r8, 0(%0);"
+		" movq %%r8, 0(%1);"
 
 		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
-		" mulxq 96(%1), %%r8, %%r13;"
+		" mulxq 96(%0), %%r8, %%r13;"
 		" xor %%ecx, %%ecx;"
-		" adoxq 64(%1), %%r8;"
-		" mulxq 104(%1), %%r9, %%rbx;"
+		" adoxq 64(%0), %%r8;"
+		" mulxq 104(%0), %%r9, %%rbx;"
 		" adcx %%r13, %%r9;"
-		" adoxq 72(%1), %%r9;"
-		" mulxq 112(%1), %%r10, %%r13;"
+		" adoxq 72(%0), %%r9;"
+		" mulxq 112(%0), %%r10, %%r13;"
 		" adcx %%rbx, %%r10;"
-		" adoxq 80(%1), %%r10;"
-		" mulxq 120(%1), %%r11, %%rax;"
+		" adoxq 80(%0), %%r10;"
+		" mulxq 120(%0), %%r11, %%rax;"
 		" adcx %%r13, %%r11;"
-		" adoxq 88(%1), %%r11;"
+		" adoxq 88(%0), %%r11;"
 		" adcx %%rcx, %%rax;"
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
···
 		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
-		" movq %%r9, 40(%0);"
+		" movq %%r9, 40(%1);"
 		" adcx %%rcx, %%r10;"
-		" movq %%r10, 48(%0);"
+		" movq %%r10, 48(%1);"
 		" adcx %%rcx, %%r11;"
-		" movq %%r11, 56(%0);"
+		" movq %%r11, 56(%1);"
 
 		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
-		" movq %%r8, 32(%0);"
-		: "+&r" (tmp), "+&r" (f), "+&r" (out)
-		:
-		: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%rbx", "%r13", "%r14", "%r15", "memory", "cc"
-	);
+		" movq %%r8, 32(%1);"
+		: "+&r"(f), "+&r"(tmp)
+		: "r"(out)
+		: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11",
+		  "%r13", "%r14", "%r15", "memory", "cc");
 }
 
 static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
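
The "Wrap the result back into the field" sequences in the diff reduce a
512-bit intermediate modulo 2^255 - 19 by exploiting 2^256 == 38
(mod 2^255 - 19): multiply the high four limbs by 38, add them to the low
four, then fold the leftover carry (again times 38) back into the lowest
limb. A rough C sketch of that reduction (illustrative only: the
wrap_field() name and the u64/u128 typedefs are ours, and the real code
keeps everything in registers on adcx/adox carry chains):

    typedef unsigned long long u64;
    typedef unsigned __int128 u128;

    /* Reduce tmp[0..7] mod 2^255 - 19, mirroring the Step 1/2/3
     * comments in the asm. Like the asm, the result is < 2^256 and
     * congruent to the input; it is not fully canonicalized. */
    static void wrap_field(u64 out[4], const u64 tmp[8])
    {
            u128 acc;
            u64 carry = 0;
            int i;

            /* Step 1: out + carry == tmp_hi * 38 + tmp_lo */
            for (i = 0; i < 4; i++) {
                    acc = (u128)tmp[4 + i] * 38 + tmp[i] + carry;
                    out[i] = (u64)acc;
                    carry = (u64)(acc >> 64);
            }

            /* Step 2: fold carry * 38 back into the low limbs */
            acc = (u128)out[0] + carry * 38;
            out[0] = (u64)acc;
            for (i = 1; i < 4; i++) {
                    acc = (u128)out[i] + (u64)(acc >> 64);
                    out[i] = (u64)acc;
            }

            /* Step 3: the final carry bit folds in as one more 38;
             * guaranteed not to carry out again. */
            out[0] += (u64)(acc >> 64) * 38;
    }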