Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

crypto: powerpc - Add POWER8 optimised crc32c

Use the vector polynomial multiply-sum instructions in POWER8 to
speed up crc32c.

This is just over 41x faster than the slice-by-8 method that it
replaces. Measurements on a 4.1 GHz POWER8 show it sustaining
52 GiB/sec.

A simple btrfs write performance test:

dd if=/dev/zero of=/mnt/tmpfile bs=1M count=4096
sync

is over 3.7x faster.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

authored by

Anton Blanchard and committed by
Herbert Xu
6dd7a82c 151f2511

+1745
+2
arch/powerpc/crypto/Makefile
··· 9 9 obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o 10 10 obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o 11 11 obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o 12 + obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o 12 13 13 14 aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o 14 15 md5-ppc-y := md5-asm.o md5-glue.o 15 16 sha1-powerpc-y := sha1-powerpc-asm.o sha1.o 16 17 sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o 17 18 sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o 19 + crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
+1553
arch/powerpc/crypto/crc32c-vpmsum_asm.S
··· 1 + /* 2 + * Calculate the checksum of data that is 16 byte aligned and a multiple of 3 + * 16 bytes. 4 + * 5 + * The first step is to reduce it to 1024 bits. We do this in 8 parallel 6 + * chunks in order to mask the latency of the vpmsum instructions. If we 7 + * have more than 32 kB of data to checksum we repeat this step multiple 8 + * times, passing in the previous 1024 bits. 9 + * 10 + * The next step is to reduce the 1024 bits to 64 bits. This step adds 11 + * 32 bits of 0s to the end - this matches what a CRC does. We just 12 + * calculate constants that land the data in this 32 bits. 13 + * 14 + * We then use fixed point Barrett reduction to compute a mod n over GF(2) 15 + * for n = CRC using POWER8 instructions. We use x = 32. 16 + * 17 + * http://en.wikipedia.org/wiki/Barrett_reduction 18 + * 19 + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM 20 + * 21 + * This program is free software; you can redistribute it and/or 22 + * modify it under the terms of the GNU General Public License 23 + * as published by the Free Software Foundation; either version 24 + * 2 of the License, or (at your option) any later version. 
25 + */ 26 + #include <asm/ppc_asm.h> 27 + #include <asm/ppc-opcode.h> 28 + 29 + .section .rodata 30 + .balign 16 31 + 32 + .byteswap_constant: 33 + /* byte reverse permute constant */ 34 + .octa 0x0F0E0D0C0B0A09080706050403020100 35 + 36 + #define MAX_SIZE 32768 37 + .constants: 38 + 39 + /* Reduce 262144 bits (MAX_SIZE bytes) to 1024 bits */ 40 + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ 41 + .octa 0x00000000b6ca9e20000000009c37c408 42 + 43 + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ 44 + .octa 0x00000000350249a800000001b51df26c 45 + 46 + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ 47 + .octa 0x00000001862dac54000000000724b9d0 48 + 49 + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ 50 + .octa 0x00000001d87fb48c00000001c00532fe 51 + 52 + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ 53 + .octa 0x00000001f39b699e00000000f05a9362 54 + 55 + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ 56 + .octa 0x0000000101da11b400000001e1007970 57 + 58 + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ 59 + .octa 0x00000001cab571e000000000a57366ee 60 + 61 + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ 62 + .octa 0x00000000c7020cfe0000000192011284 63 + 64 + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ 65 + .octa 0x00000000cdaed1ae0000000162716d9a 66 + 67 + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ 68 + .octa 0x00000001e804effc00000000cd97ecde 69 + 70 + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ 71 + .octa 0x0000000077c3ea3a0000000058812bc0 72 + 73 + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ 74 + .octa 0x0000000068df31b40000000088b8c12e 75 + 76 + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ 77 + .octa 0x00000000b059b6c200000001230b234c 78 + 79 + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ 80 + .octa 0x0000000145fb8ed800000001120b416e 81 + 82 + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ 83 + .octa 0x00000000cbc0916800000001974aecb0 84 + 85 + 
/* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ 86 + .octa 0x000000005ceeedc2000000008ee3f226 87 + 88 + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ 89 + .octa 0x0000000047d74e8600000001089aba9a 90 + 91 + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ 92 + .octa 0x00000001407e9e220000000065113872 93 + 94 + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ 95 + .octa 0x00000001da967bda000000005c07ec10 96 + 97 + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ 98 + .octa 0x000000006c8983680000000187590924 99 + 100 + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ 101 + .octa 0x00000000f2d14c9800000000e35da7c6 102 + 103 + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ 104 + .octa 0x00000001993c6ad4000000000415855a 105 + 106 + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ 107 + .octa 0x000000014683d1ac0000000073617758 108 + 109 + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ 110 + .octa 0x00000001a7c93e6c0000000176021d28 111 + 112 + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ 113 + .octa 0x000000010211e90a00000001c358fd0a 114 + 115 + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ 116 + .octa 0x000000001119403e00000001ff7a2c18 117 + 118 + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ 119 + .octa 0x000000001c3261aa00000000f2d9f7e4 120 + 121 + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ 122 + .octa 0x000000014e37a634000000016cf1f9c8 123 + 124 + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ 125 + .octa 0x0000000073786c0c000000010af9279a 126 + 127 + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ 128 + .octa 0x000000011dc037f80000000004f101e8 129 + 130 + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ 131 + .octa 0x0000000031433dfc0000000070bcf184 132 + 133 + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ 134 + .octa 0x000000009cde8348000000000a8de642 135 + 136 + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ 137 + .octa 
0x0000000038d3c2a60000000062ea130c 138 + 139 + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ 140 + .octa 0x000000011b25f26000000001eb31cbb2 141 + 142 + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ 143 + .octa 0x000000001629e6f00000000170783448 144 + 145 + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ 146 + .octa 0x0000000160838b4c00000001a684b4c6 147 + 148 + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ 149 + .octa 0x000000007a44011c00000000253ca5b4 150 + 151 + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ 152 + .octa 0x00000000226f417a0000000057b4b1e2 153 + 154 + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ 155 + .octa 0x0000000045eb2eb400000000b6bd084c 156 + 157 + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ 158 + .octa 0x000000014459d70c0000000123c2d592 159 + 160 + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ 161 + .octa 0x00000001d406ed8200000000159dafce 162 + 163 + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ 164 + .octa 0x0000000160c8e1a80000000127e1a64e 165 + 166 + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ 167 + .octa 0x0000000027ba80980000000056860754 168 + 169 + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ 170 + .octa 0x000000006d92d01800000001e661aae8 171 + 172 + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ 173 + .octa 0x000000012ed7e3f200000000f82c6166 174 + 175 + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ 176 + .octa 0x000000002dc8778800000000c4f9c7ae 177 + 178 + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ 179 + .octa 0x0000000018240bb80000000074203d20 180 + 181 + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ 182 + .octa 0x000000001ad381580000000198173052 183 + 184 + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ 185 + .octa 0x00000001396b78f200000001ce8aba54 186 + 187 + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ 188 + .octa 0x000000011a68133400000001850d5d94 189 + 190 + /* x^209920 
mod p(x)` << 1, x^209984 mod p(x)` << 1 */ 191 + .octa 0x000000012104732e00000001d609239c 192 + 193 + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ 194 + .octa 0x00000000a140d90c000000001595f048 195 + 196 + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ 197 + .octa 0x00000001b7215eda0000000042ccee08 198 + 199 + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ 200 + .octa 0x00000001aaf1df3c000000010a389d74 201 + 202 + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ 203 + .octa 0x0000000029d15b8a000000012a840da6 204 + 205 + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ 206 + .octa 0x00000000f1a96922000000001d181c0c 207 + 208 + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ 209 + .octa 0x00000001ac80d03c0000000068b7d1f6 210 + 211 + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ 212 + .octa 0x000000000f11d56a000000005b0f14fc 213 + 214 + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ 215 + .octa 0x00000001f1c022a20000000179e9e730 216 + 217 + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ 218 + .octa 0x0000000173d00ae200000001ce1368d6 219 + 220 + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ 221 + .octa 0x00000001d4ffe4ac0000000112c3a84c 222 + 223 + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ 224 + .octa 0x000000016edc5ae400000000de940fee 225 + 226 + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ 227 + .octa 0x00000001f1a0214000000000fe896b7e 228 + 229 + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ 230 + .octa 0x00000000ca0b28a000000001f797431c 231 + 232 + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ 233 + .octa 0x00000001928e30a20000000053e989ba 234 + 235 + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ 236 + .octa 0x0000000097b1b002000000003920cd16 237 + 238 + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ 239 + .octa 0x00000000b15bf90600000001e6f579b8 240 + 241 + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ 242 + .octa 
0x00000000411c5d52000000007493cb0a 243 + 244 + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ 245 + .octa 0x00000001c36f330000000001bdd376d8 246 + 247 + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ 248 + .octa 0x00000001119227e0000000016badfee6 249 + 250 + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ 251 + .octa 0x00000000114d47020000000071de5c58 252 + 253 + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ 254 + .octa 0x00000000458b5b9800000000453f317c 255 + 256 + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ 257 + .octa 0x000000012e31fb8e0000000121675cce 258 + 259 + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ 260 + .octa 0x000000005cf619d800000001f409ee92 261 + 262 + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ 263 + .octa 0x0000000063f4d8b200000000f36b9c88 264 + 265 + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ 266 + .octa 0x000000004138dc8a0000000036b398f4 267 + 268 + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ 269 + .octa 0x00000001d29ee8e000000001748f9adc 270 + 271 + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ 272 + .octa 0x000000006a08ace800000001be94ec00 273 + 274 + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ 275 + .octa 0x0000000127d4201000000000b74370d6 276 + 277 + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ 278 + .octa 0x0000000019d76b6200000001174d0b98 279 + 280 + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ 281 + .octa 0x00000001b1471f6e00000000befc06a4 282 + 283 + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ 284 + .octa 0x00000001f64c19cc00000001ae125288 285 + 286 + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ 287 + .octa 0x00000000003c0ea00000000095c19b34 288 + 289 + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ 290 + .octa 0x000000014d73abf600000001a78496f2 291 + 292 + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ 293 + .octa 0x00000001620eb84400000001ac5390a0 294 + 295 + /* x^174080 
mod p(x)` << 1, x^174144 mod p(x)` << 1 */ 296 + .octa 0x0000000147655048000000002a80ed6e 297 + 298 + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ 299 + .octa 0x0000000067b5077e00000001fa9b0128 300 + 301 + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ 302 + .octa 0x0000000010ffe20600000001ea94929e 303 + 304 + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ 305 + .octa 0x000000000fee8f1e0000000125f4305c 306 + 307 + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ 308 + .octa 0x00000001da26fbae00000001471e2002 309 + 310 + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ 311 + .octa 0x00000001b3a8bd880000000132d2253a 312 + 313 + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ 314 + .octa 0x00000000e8f3898e00000000f26b3592 315 + 316 + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ 317 + .octa 0x00000000b0d0d28c00000000bc8b67b0 318 + 319 + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ 320 + .octa 0x0000000030f2a798000000013a826ef2 321 + 322 + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ 323 + .octa 0x000000000fba10020000000081482c84 324 + 325 + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ 326 + .octa 0x00000000bdb9bd7200000000e77307c2 327 + 328 + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ 329 + .octa 0x0000000075d3bf5a00000000d4a07ec8 330 + 331 + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ 332 + .octa 0x00000000ef1f98a00000000017102100 333 + 334 + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ 335 + .octa 0x00000000689c760200000000db406486 336 + 337 + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ 338 + .octa 0x000000016d5fa5fe0000000192db7f88 339 + 340 + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ 341 + .octa 0x00000001d0d2b9ca000000018bf67b1e 342 + 343 + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ 344 + .octa 0x0000000041e7b470000000007c09163e 345 + 346 + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ 347 + .octa 
0x00000001cbb6495e000000000adac060 348 + 349 + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ 350 + .octa 0x000000010052a0b000000000bd8316ae 351 + 352 + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ 353 + .octa 0x00000001d8effb5c000000019f09ab54 354 + 355 + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ 356 + .octa 0x00000001d969853c0000000125155542 357 + 358 + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ 359 + .octa 0x00000000523ccce2000000018fdb5882 360 + 361 + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ 362 + .octa 0x000000001e2436bc00000000e794b3f4 363 + 364 + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ 365 + .octa 0x00000000ddd1c3a2000000016f9bb022 366 + 367 + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ 368 + .octa 0x0000000019fcfe3800000000290c9978 369 + 370 + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ 371 + .octa 0x00000001ce95db640000000083c0f350 372 + 373 + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ 374 + .octa 0x00000000af5828060000000173ea6628 375 + 376 + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ 377 + .octa 0x00000001006388f600000001c8b4e00a 378 + 379 + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ 380 + .octa 0x0000000179eca00a00000000de95d6aa 381 + 382 + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ 383 + .octa 0x0000000122410a6a000000010b7f7248 384 + 385 + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ 386 + .octa 0x000000004288e87c00000001326e3a06 387 + 388 + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ 389 + .octa 0x000000016c5490da00000000bb62c2e6 390 + 391 + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ 392 + .octa 0x00000000d1c71f6e0000000156a4b2c2 393 + 394 + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ 395 + .octa 0x00000001b4ce08a6000000011dfe763a 396 + 397 + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ 398 + .octa 0x00000001466ba60c000000007bcca8e2 399 + 400 + /* x^138240 
mod p(x)` << 1, x^138304 mod p(x)` << 1 */ 401 + .octa 0x00000001f6c488a40000000186118faa 402 + 403 + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ 404 + .octa 0x000000013bfb06820000000111a65a88 405 + 406 + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ 407 + .octa 0x00000000690e9e54000000003565e1c4 408 + 409 + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ 410 + .octa 0x00000000281346b6000000012ed02a82 411 + 412 + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ 413 + .octa 0x000000015646402400000000c486ecfc 414 + 415 + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ 416 + .octa 0x000000016063a8dc0000000001b951b2 417 + 418 + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ 419 + .octa 0x0000000116a663620000000048143916 420 + 421 + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ 422 + .octa 0x000000017e8aa4d200000001dc2ae124 423 + 424 + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ 425 + .octa 0x00000001728eb10c00000001416c58d6 426 + 427 + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ 428 + .octa 0x00000001b08fd7fa00000000a479744a 429 + 430 + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ 431 + .octa 0x00000001092a16e80000000096ca3a26 432 + 433 + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ 434 + .octa 0x00000000a505637c00000000ff223d4e 435 + 436 + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ 437 + .octa 0x00000000d94869b2000000010e84da42 438 + 439 + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ 440 + .octa 0x00000001c8b203ae00000001b61ba3d0 441 + 442 + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ 443 + .octa 0x000000005704aea000000000680f2de8 444 + 445 + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ 446 + .octa 0x000000012e295fa2000000008772a9a8 447 + 448 + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ 449 + .octa 0x000000011d0908bc0000000155f295bc 450 + 451 + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ 452 + .octa 
0x0000000193ed97ea00000000595f9282 453 + 454 + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ 455 + .octa 0x000000013a0f1c520000000164b1c25a 456 + 457 + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ 458 + .octa 0x000000010c2c40c000000000fbd67c50 459 + 460 + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ 461 + .octa 0x00000000ff6fac3e0000000096076268 462 + 463 + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ 464 + .octa 0x000000017b3609c000000001d288e4cc 465 + 466 + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ 467 + .octa 0x0000000088c8c92200000001eaac1bdc 468 + 469 + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ 470 + .octa 0x00000001751baae600000001f1ea39e2 471 + 472 + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ 473 + .octa 0x000000010795297200000001eb6506fc 474 + 475 + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ 476 + .octa 0x0000000162b00abe000000010f806ffe 477 + 478 + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ 479 + .octa 0x000000000d7b404c000000010408481e 480 + 481 + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ 482 + .octa 0x00000000763b13d40000000188260534 483 + 484 + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ 485 + .octa 0x00000000f6dc22d80000000058fc73e0 486 + 487 + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ 488 + .octa 0x000000007daae06000000000391c59b8 489 + 490 + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ 491 + .octa 0x000000013359ab7c000000018b638400 492 + 493 + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ 494 + .octa 0x000000008add438a000000011738f5c4 495 + 496 + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ 497 + .octa 0x00000001edbefdea000000008cf7c6da 498 + 499 + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ 500 + .octa 0x000000004104e0f800000001ef97fb16 501 + 502 + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ 503 + .octa 0x00000000b48a82220000000102130e20 504 + 505 + /* x^102400 
mod p(x)` << 1, x^102464 mod p(x)` << 1 */ 506 + .octa 0x00000001bcb4684400000000db968898 507 + 508 + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ 509 + .octa 0x000000013293ce0a00000000b5047b5e 510 + 511 + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ 512 + .octa 0x00000001710d0844000000010b90fdb2 513 + 514 + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ 515 + .octa 0x0000000117907f6e000000004834a32e 516 + 517 + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ 518 + .octa 0x0000000087ddf93e0000000059c8f2b0 519 + 520 + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ 521 + .octa 0x000000005970e9b00000000122cec508 522 + 523 + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ 524 + .octa 0x0000000185b2b7d0000000000a330cda 525 + 526 + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ 527 + .octa 0x00000001dcee0efc000000014a47148c 528 + 529 + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ 530 + .octa 0x0000000030da27220000000042c61cb8 531 + 532 + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ 533 + .octa 0x000000012f925a180000000012fe6960 534 + 535 + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ 536 + .octa 0x00000000dd2e357c00000000dbda2c20 537 + 538 + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ 539 + .octa 0x00000000071c80de000000011122410c 540 + 541 + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ 542 + .octa 0x000000011513140a00000000977b2070 543 + 544 + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ 545 + .octa 0x00000001df876e8e000000014050438e 546 + 547 + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ 548 + .octa 0x000000015f81d6ce0000000147c840e8 549 + 550 + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ 551 + .octa 0x000000019dd94dbe00000001cc7c88ce 552 + 553 + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ 554 + .octa 0x00000001373d206e00000001476b35a4 555 + 556 + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ 557 + .octa 0x00000000668ccade000000013d52d508 
558 + 559 + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ 560 + .octa 0x00000001b192d268000000008e4be32e 561 + 562 + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ 563 + .octa 0x00000000e30f3a7800000000024120fe 564 + 565 + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ 566 + .octa 0x000000010ef1f7bc00000000ddecddb4 567 + 568 + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ 569 + .octa 0x00000001f5ac738000000000d4d403bc 570 + 571 + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ 572 + .octa 0x000000011822ea7000000001734b89aa 573 + 574 + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ 575 + .octa 0x00000000c3a33848000000010e7a58d6 576 + 577 + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ 578 + .octa 0x00000001bd151c2400000001f9f04e9c 579 + 580 + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ 581 + .octa 0x0000000056002d7600000000b692225e 582 + 583 + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ 584 + .octa 0x000000014657c4f4000000019b8d3f3e 585 + 586 + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ 587 + .octa 0x0000000113742d7c00000001a874f11e 588 + 589 + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ 590 + .octa 0x000000019c5920ba000000010d5a4254 591 + 592 + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ 593 + .octa 0x000000005216d2d600000000bbb2f5d6 594 + 595 + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ 596 + .octa 0x0000000136f5ad8a0000000179cc0e36 597 + 598 + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ 599 + .octa 0x000000018b07beb600000001dca1da4a 600 + 601 + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ 602 + .octa 0x00000000db1e93b000000000feb1a192 603 + 604 + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ 605 + .octa 0x000000000b96fa3a00000000d1eeedd6 606 + 607 + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ 608 + .octa 0x00000001d9968af0000000008fad9bb4 609 + 610 + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ 611 + .octa 
0x000000000e4a77a200000001884938e4 612 + 613 + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ 614 + .octa 0x00000000508c2ac800000001bc2e9bc0 615 + 616 + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ 617 + .octa 0x0000000021572a8000000001f9658a68 618 + 619 + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ 620 + .octa 0x00000001b859daf2000000001b9224fc 621 + 622 + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ 623 + .octa 0x000000016f7884740000000055b2fb84 624 + 625 + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ 626 + .octa 0x00000001b438810e000000018b090348 627 + 628 + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ 629 + .octa 0x0000000095ddc6f2000000011ccbd5ea 630 + 631 + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ 632 + .octa 0x00000001d977c20c0000000007ae47f8 633 + 634 + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ 635 + .octa 0x00000000ebedb99a0000000172acbec0 636 + 637 + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ 638 + .octa 0x00000001df9e9e9200000001c6e3ff20 639 + 640 + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ 641 + .octa 0x00000001a4a3f95200000000e1b38744 642 + 643 + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ 644 + .octa 0x00000000e2f5122000000000791585b2 645 + 646 + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ 647 + .octa 0x000000004aa01f3e00000000ac53b894 648 + 649 + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ 650 + .octa 0x00000000b3e90a5800000001ed5f2cf4 651 + 652 + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ 653 + .octa 0x000000000c9ca2aa00000001df48b2e0 654 + 655 + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ 656 + .octa 0x000000015168231600000000049c1c62 657 + 658 + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ 659 + .octa 0x0000000036fce78c000000017c460c12 660 + 661 + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ 662 + .octa 0x000000009037dc10000000015be4da7e 663 + 664 + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 
1 */ 665 + .octa 0x00000000d3298582000000010f38f668 666 + 667 + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ 668 + .octa 0x00000001b42e8ad60000000039f40a00 669 + 670 + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ 671 + .octa 0x00000000142a983800000000bd4c10c4 672 + 673 + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ 674 + .octa 0x0000000109c7f1900000000042db1d98 675 + 676 + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ 677 + .octa 0x0000000056ff931000000001c905bae6 678 + 679 + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ 680 + .octa 0x00000001594513aa00000000069d40ea 681 + 682 + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ 683 + .octa 0x00000001e3b5b1e8000000008e4fbad0 684 + 685 + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ 686 + .octa 0x000000011dd5fc080000000047bedd46 687 + 688 + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ 689 + .octa 0x00000001675f0cc20000000026396bf8 690 + 691 + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ 692 + .octa 0x00000000d1c8dd4400000000379beb92 693 + 694 + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ 695 + .octa 0x0000000115ebd3d8000000000abae54a 696 + 697 + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ 698 + .octa 0x00000001ecbd0dac0000000007e6a128 699 + 700 + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ 701 + .octa 0x00000000cdf67af2000000000ade29d2 702 + 703 + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ 704 + .octa 0x000000004c01ff4c00000000f974c45c 705 + 706 + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ 707 + .octa 0x00000000f2d8657e00000000e77ac60a 708 + 709 + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ 710 + .octa 0x000000006bae74c40000000145895816 711 + 712 + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ 713 + .octa 0x0000000152af8aa00000000038e362be 714 + 715 + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ 716 + .octa 0x0000000004663802000000007f991a64 717 + 718 + /* x^29696 mod p(x)` << 1, 
x^29760 mod p(x)` << 1 */ 719 + .octa 0x00000001ab2f5afc00000000fa366d3a 720 + 721 + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ 722 + .octa 0x0000000074a4ebd400000001a2bb34f0 723 + 724 + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ 725 + .octa 0x00000001d7ab3a4c0000000028a9981e 726 + 727 + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ 728 + .octa 0x00000001a8da60c600000001dbc672be 729 + 730 + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ 731 + .octa 0x000000013cf6382000000000b04d77f6 732 + 733 + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ 734 + .octa 0x00000000bec12e1e0000000124400d96 735 + 736 + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ 737 + .octa 0x00000001c6368010000000014ca4b414 738 + 739 + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ 740 + .octa 0x00000001e6e78758000000012fe2c938 741 + 742 + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ 743 + .octa 0x000000008d7f2b3c00000001faed01e6 744 + 745 + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ 746 + .octa 0x000000016b4a156e000000007e80ecfe 747 + 748 + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ 749 + .octa 0x00000001c63cfeb60000000098daee94 750 + 751 + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ 752 + .octa 0x000000015f902670000000010a04edea 753 + 754 + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ 755 + .octa 0x00000001cd5de11e00000001c00b4524 756 + 757 + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ 758 + .octa 0x000000001acaec540000000170296550 759 + 760 + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ 761 + .octa 0x000000002bd0ca780000000181afaa48 762 + 763 + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ 764 + .octa 0x0000000032d63d5c0000000185a31ffa 765 + 766 + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ 767 + .octa 0x000000001c6d4e4c000000002469f608 768 + 769 + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ 770 + .octa 0x0000000106a60b92000000006980102a 771 + 772 + /* x^11264 
mod p(x)` << 1, x^11328 mod p(x)` << 1 */ 773 + .octa 0x00000000d3855e120000000111ea9ca8 774 + 775 + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ 776 + .octa 0x00000000e312563600000001bd1d29ce 777 + 778 + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ 779 + .octa 0x000000009e8f7ea400000001b34b9580 780 + 781 + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ 782 + .octa 0x00000001c82e562c000000003076054e 783 + 784 + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ 785 + .octa 0x00000000ca9f09ce000000012a608ea4 786 + 787 + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ 788 + .octa 0x00000000c63764e600000000784d05fe 789 + 790 + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ 791 + .octa 0x0000000168d2e49e000000016ef0d82a 792 + 793 + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ 794 + .octa 0x00000000e986c1480000000075bda454 795 + 796 + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ 797 + .octa 0x00000000cfb65894000000003dc0a1c4 798 + 799 + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ 800 + .octa 0x0000000111cadee400000000e9a5d8be 801 + 802 + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ 803 + .octa 0x0000000171fb63ce00000001609bc4b4 804 + 805 + .short_constants: 806 + 807 + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ 808 + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ 809 + .octa 0x7fec2963e5bf80485cf015c388e56f72 810 + 811 + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ 812 + .octa 0x38e888d4844752a9963a18920246e2e6 813 + 814 + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ 815 + .octa 0x42316c00730206ad419a441956993a31 816 + 817 + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ 818 + .octa 0x543d5c543e65ddf9924752ba2b830011 819 + 820 + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ 821 + .octa 
0x78e87aaf56767c9255bd7f9518e4a304

	/* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */
	.octa 0x8f68fcec1903da7f6d76739fe0553f1e

	/* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */
	.octa 0x3f4840246791d588c133722b1fe0b5c3

	/* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */
	.octa 0x34c96751b04de25a64b67ee0e55ef1f3

	/* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */
	.octa 0x156c8e180b4a395b069db049b8fdb1e7

	/* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */
	.octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e

	/* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */
	.octa 0x041d37768cd75659817cdc5119b29a35

	/* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */
	.octa 0x3a0777818cfaa9651ce9d94b36c41f1c

	/* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */
	.octa 0x0e148e8252377a554f256efcb82be955

	/* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */
	.octa 0x9c25531d19e65ddeec1631edb2dea967

	/* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */
	.octa 0x790606ff9957c0a65d27e147510ac59a

	/* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */
	.octa 0x82f63b786ea2d55ca66805eb18b8ea18


.barrett_constants:
	/* 33 bit reflected Barrett constant m - (4^32)/n */
	.octa 0x000000000000000000000000dea713f1	/* x^64 div p(x)` */
	/* 33 bit reflected Barrett constant n */
	.octa 0x00000000000000000000000105ec76f1

	.text

/* Input data is byte swapped (via VPERM) on big endian builds only */
#if defined(__BIG_ENDIAN__)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/* Index registers holding the constant offsets 16..112 for lvx/stvx */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

/* The two folding constants currently in flight */
#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

/* VPERM expands to nothing when no byte swap is needed */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/*
 * unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len)
 *
 * In:	r3 = crc (initial value), r4 = p, r5 = len
 * Out:	r3 = updated crc; the incoming crc is returned unchanged if len == 0
 *
 * As stated at the top of this file, p must be 16 byte aligned and len a
 * multiple of 16 bytes.
 *
 * Register use:
 *   r25-r31	off16..off112, saved/restored at negative offsets from r1
 *   v20-v29	saved/restored below the GPR save slots; r1 is never moved,
 *		so no stack frame is allocated
 *   r10	copy of the incoming crc, used by the len == 0 path
 *   r0		0 on the first pass through the MAX_SIZE loop, 1 afterwards
 *   r6-r9	scratch (remaining length, loop counts, table offsets)
 *   v0-v7	eight parallel accumulators
 *   v8-v15	vpmsum results in flight
 *   v16-v23	input data in flight
 *
 * NOTE(review): "ori r2,r2,0" leaves r2 unchanged; it is presumably there
 * to steer instruction grouping - confirm against the POWER8 user manual.
 */
FUNC_START(__crc32c_vpmsum)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room for saving 10 non volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	/* Keep the incoming crc for the len == 0 return path (.Lzero) */
	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	/* Less than 256 bytes: handled entirely by the .Lshort path */
	cmpdi	r5,256
	blt	.Lshort

	/* r6 = len with the low 7 bits cleared (whole 128 byte blocks) */
	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	/* const2 is dead from here on; fetch the next iteration's value */
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * Loop back for another block if whole 128 byte blocks remain.
	 * r0 = 1 tells the next pass to reuse v16-v23 instead of loading;
	 * the compare is done before r6 is bumped by 128.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1		/* xor two 64 bit results together */

	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1

	vand	v0,v0,mask_64bit

	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	/* Restore v20-v29 and r25-r31 from the slots written on entry */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/*
	 * Only one 128 byte group was left for this block: multiply it once
	 * and go straight to the final xor pass.
	 */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	/*
	 * len < 256: every 16 byte chunk is multiplied by its own entry of
	 * .short_constants and the products are xored together below.
	 */
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	/* v19/v20 accumulate the odd/even numbered products */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/* Enter at .LvN, N = index of the last chunk processed, fall through */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* Nothing to checksum: hand back the crc we were given */
	mr	r3,r10
	b	.Lout

FUNC_END(__crc32c_vpmsum)
+167
arch/powerpc/crypto/crc32c-vpmsum_glue.c
··· 1 + #include <linux/crc32.h> 2 + #include <crypto/internal/hash.h> 3 + #include <linux/init.h> 4 + #include <linux/module.h> 5 + #include <linux/string.h> 6 + #include <linux/kernel.h> 7 + #include <asm/switch_to.h> 8 + 9 + #define CHKSUM_BLOCK_SIZE 1 10 + #define CHKSUM_DIGEST_SIZE 4 11 + 12 + #define VMX_ALIGN 16 13 + #define VMX_ALIGN_MASK (VMX_ALIGN-1) 14 + 15 + #define VECTOR_BREAKPOINT 512 16 + 17 + u32 __crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len); 18 + 19 + static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len) 20 + { 21 + unsigned int prealign; 22 + unsigned int tail; 23 + 24 + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt()) 25 + return __crc32c_le(crc, p, len); 26 + 27 + if ((unsigned long)p & VMX_ALIGN_MASK) { 28 + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); 29 + crc = __crc32c_le(crc, p, prealign); 30 + len -= prealign; 31 + p += prealign; 32 + } 33 + 34 + if (len & ~VMX_ALIGN_MASK) { 35 + pagefault_disable(); 36 + enable_kernel_altivec(); 37 + crc = __crc32c_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); 38 + pagefault_enable(); 39 + } 40 + 41 + tail = len & VMX_ALIGN_MASK; 42 + if (tail) { 43 + p += len & ~VMX_ALIGN_MASK; 44 + crc = __crc32c_le(crc, p, tail); 45 + } 46 + 47 + return crc; 48 + } 49 + 50 + static int crc32c_vpmsum_cra_init(struct crypto_tfm *tfm) 51 + { 52 + u32 *key = crypto_tfm_ctx(tfm); 53 + 54 + *key = 0; 55 + 56 + return 0; 57 + } 58 + 59 + /* 60 + * Setting the seed allows arbitrary accumulators and flexible XOR policy 61 + * If your algorithm starts with ~0, then XOR with ~0 before you set 62 + * the seed. 
63 + */ 64 + static int crc32c_vpmsum_setkey(struct crypto_shash *hash, const u8 *key, 65 + unsigned int keylen) 66 + { 67 + u32 *mctx = crypto_shash_ctx(hash); 68 + 69 + if (keylen != sizeof(u32)) { 70 + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); 71 + return -EINVAL; 72 + } 73 + *mctx = le32_to_cpup((__le32 *)key); 74 + return 0; 75 + } 76 + 77 + static int crc32c_vpmsum_init(struct shash_desc *desc) 78 + { 79 + u32 *mctx = crypto_shash_ctx(desc->tfm); 80 + u32 *crcp = shash_desc_ctx(desc); 81 + 82 + *crcp = *mctx; 83 + 84 + return 0; 85 + } 86 + 87 + static int crc32c_vpmsum_update(struct shash_desc *desc, const u8 *data, 88 + unsigned int len) 89 + { 90 + u32 *crcp = shash_desc_ctx(desc); 91 + 92 + *crcp = crc32c_vpmsum(*crcp, data, len); 93 + 94 + return 0; 95 + } 96 + 97 + static int __crc32c_vpmsum_finup(u32 *crcp, const u8 *data, unsigned int len, 98 + u8 *out) 99 + { 100 + *(__le32 *)out = ~cpu_to_le32(crc32c_vpmsum(*crcp, data, len)); 101 + 102 + return 0; 103 + } 104 + 105 + static int crc32c_vpmsum_finup(struct shash_desc *desc, const u8 *data, 106 + unsigned int len, u8 *out) 107 + { 108 + return __crc32c_vpmsum_finup(shash_desc_ctx(desc), data, len, out); 109 + } 110 + 111 + static int crc32c_vpmsum_final(struct shash_desc *desc, u8 *out) 112 + { 113 + u32 *crcp = shash_desc_ctx(desc); 114 + 115 + *(__le32 *)out = ~cpu_to_le32p(crcp); 116 + 117 + return 0; 118 + } 119 + 120 + static int crc32c_vpmsum_digest(struct shash_desc *desc, const u8 *data, 121 + unsigned int len, u8 *out) 122 + { 123 + return __crc32c_vpmsum_finup(crypto_shash_ctx(desc->tfm), data, len, 124 + out); 125 + } 126 + 127 + static struct shash_alg alg = { 128 + .setkey = crc32c_vpmsum_setkey, 129 + .init = crc32c_vpmsum_init, 130 + .update = crc32c_vpmsum_update, 131 + .final = crc32c_vpmsum_final, 132 + .finup = crc32c_vpmsum_finup, 133 + .digest = crc32c_vpmsum_digest, 134 + .descsize = sizeof(u32), 135 + .digestsize = CHKSUM_DIGEST_SIZE, 136 + .base = { 137 + 
.cra_name = "crc32c", 138 + .cra_driver_name = "crc32c-vpmsum", 139 + .cra_priority = 200, 140 + .cra_blocksize = CHKSUM_BLOCK_SIZE, 141 + .cra_ctxsize = sizeof(u32), 142 + .cra_module = THIS_MODULE, 143 + .cra_init = crc32c_vpmsum_cra_init, 144 + } 145 + }; 146 + 147 + static int __init crc32c_vpmsum_mod_init(void) 148 + { 149 + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) 150 + return -ENODEV; 151 + 152 + return crypto_register_shash(&alg); 153 + } 154 + 155 + static void __exit crc32c_vpmsum_mod_fini(void) 156 + { 157 + crypto_unregister_shash(&alg); 158 + } 159 + 160 + module_init(crc32c_vpmsum_mod_init); 161 + module_exit(crc32c_vpmsum_mod_fini); 162 + 163 + MODULE_AUTHOR("Anton Blanchard <anton@samba.org>"); 164 + MODULE_DESCRIPTION("CRC32C using vector polynomial multiply-sum instructions"); 165 + MODULE_LICENSE("GPL"); 166 + MODULE_ALIAS_CRYPTO("crc32c"); 167 + MODULE_ALIAS_CRYPTO("crc32c-vpmsum");
+12
arch/powerpc/include/asm/ppc-opcode.h
··· 174 174 #define PPC_INST_MFSPR_DSCR_USER_MASK 0xfc1fffff 175 175 #define PPC_INST_MTSPR_DSCR_USER 0x7c0303a6 176 176 #define PPC_INST_MTSPR_DSCR_USER_MASK 0xfc1fffff /* Base opcodes for mfvsrd/mtvsrd; the MFVRD()/MTVRD() macros further down OR the register fields into them. */ 177 + #define PPC_INST_MFVSRD 0x7c000066 178 + #define PPC_INST_MTVSRD 0x7c000166 177 179 #define PPC_INST_SLBFEE 0x7c0007a7 178 180 179 181 #define PPC_INST_STRING 0x7c00042a ··· 190 188 #define PPC_INST_WAIT 0x7c00007c 191 189 #define PPC_INST_TLBIVAX 0x7c000624 192 190 #define PPC_INST_TLBSRX_DOT 0x7c0006a5 /* Base opcodes for vpmsumw/vpmsumd, used by the VPMSUMW()/VPMSUMD() macros further down. */ 191 + #define PPC_INST_VPMSUMW 0x10000488 192 + #define PPC_INST_VPMSUMD 0x100004c8 193 193 #define PPC_INST_XXLOR 0xf0000510 194 194 #define PPC_INST_XXSWAPD 0xf0000250 195 195 #define PPC_INST_XVCPSGNDP 0xf0000780 ··· 363 359 VSX_XX1((s), a, b)) 364 360 #define LXVD2X(s, a, b) stringify_in_c(.long PPC_INST_LXVD2X | \ 365 361 VSX_XX1((s), a, b)) /* MFVRD(a, t) / MTVRD(t, a): emit mfvsrd/mtvsrd as a raw .long with register operand a and vector operand t. NOTE(review): the (t)+32 presumably maps vector register t onto VSX register t+32 - confirm against the ISA. */ 362 + #define MFVRD(a, t) stringify_in_c(.long PPC_INST_MFVSRD | \ 363 + VSX_XX1((t)+32, a, R0)) 364 + #define MTVRD(t, a) stringify_in_c(.long PPC_INST_MTVSRD | \ 365 + VSX_XX1((t)+32, a, R0)) /* VPMSUMW/VPMSUMD(t, a, b): emit the vpmsum instruction as a raw .long with target t and sources a, b. */ 366 + #define VPMSUMW(t, a, b) stringify_in_c(.long PPC_INST_VPMSUMW | \ 367 + VSX_XX3((t), a, b)) 368 + #define VPMSUMD(t, a, b) stringify_in_c(.long PPC_INST_VPMSUMD | \ 369 + VSX_XX3((t), a, b)) 366 370 #define XXLOR(t, a, b) stringify_in_c(.long PPC_INST_XXLOR | \ 367 371 VSX_XX3((t), a, b)) 368 372 #define XXSWAPD(t, a) stringify_in_c(.long PPC_INST_XXSWAPD | \
+11
crypto/Kconfig
··· 437 437 gain performance compared with software implementation. 438 438 Module will be crc32c-intel. 439 439

# The symbol name below must stay in sync with the obj-$(CONFIG_...) line in
# arch/powerpc/crypto/Makefile, which uses the same CRYPT_ spelling.
# ALTIVEC is required because the glue code calls enable_kernel_altivec().
config CRYPT_CRC32C_VPMSUM
	tristate "CRC32c CRC algorithm (powerpc64)"
	depends on PPC64 && ALTIVEC
	select CRYPTO_HASH
	select CRC32
	help
	  CRC32c algorithm implemented using vector polynomial multiply-sum
	  (vpmsum) instructions, introduced in POWER8. Enable on POWER8
	  and newer processors for improved performance.


440 451 config CRYPTO_CRC32C_SPARC64 441 452 tristate "CRC32c CRC algorithm (SPARC64)" 442 453 depends on SPARC64